xref: /aosp_15_r20/external/intel-media-driver/media_driver/agnostic/common/cm/cm_kernel_rt.cpp (revision ba62d9d3abf0e404f2022b4cd7a85e107f48596f)
1 /*
2 * Copyright (c) 2007-2017, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 //!
23 //! \file      cm_kernel_rt.cpp
24 //! \brief     Contains CmKernelRT definitions.
25 //!
26 
27 #include "cm_kernel_rt.h"
28 
29 #include "cm_program.h"
30 #include "cm_device_rt.h"
31 #include "cm_surface_manager.h"
32 #include "cm_surface_2d_up_rt.h"
33 #include "cm_surface_3d_rt.h"
34 #include "cm_buffer_rt.h"
35 #include "cm_mov_inst.h"
36 #include "cm_kernel_data.h"
37 #include "cm_thread_space_rt.h"
38 #include "cm_state_buffer.h"
39 #include "cm_surface_vme.h"
40 #include "cm_debug.h"
41 #include "cm_surface_sampler8x8.h"
42 #include "cm_surface_sampler.h"
43 #include "cm_group_space.h"
44 #include "cm_surface_2d_rt.h"
45 #include "cm_sampler8x8_state_rt.h"
46 #include "cm_visa.h"
47 #include "cm_extension_creator.h"
48 #include "cm_execution_adv.h"
49 
50 #define GENERATE_GLOBAL_SURFACE_INDEX
51 
// Reads one value of `type` from `buf` at `bytePosition` into `dst`, then
// advances `bytePosition` by sizeof(type). Relies on variables named `buf` and
// `bytePosition` being visible at the expansion site. Expands to two separate
// statements, so it must not be used as the unbraced body of an if/else/loop.
52 #define READ_FIELD_FROM_BUF( dst, type ) \
53     dst = *((type *) &buf[bytePosition]); \
54     bytePosition += sizeof(type);
55 
56 #define PER_ARG_SIZE_IN_DWORD 3
57 #define KERNEL_INFO_SIZE_IN_DWORD 4
58 
// Rounds byte_address up to the next multiple of 4 when it is not already one.
// Expands to a bare `if` statement: an `else` written right after a use would
// bind to this hidden `if`.
59 #define DW_ALIGNMENT( byte_address ) \
60     if( byte_address % 4 ) \
61     byte_address = ( byte_address / 4 + 1 ) * 4;
62 
// Rounds byte_address up to the next multiple of 32 when it is not already one.
// Same bare-`if` expansion caveat as DW_ALIGNMENT.
63 #define GRF_ALIGNMENT( byte_address ) \
64     if( byte_address % 32 ) \
65     byte_address = ( byte_address / 32 + 1 ) * 32;
66 
67 // To check if surface type nType is equal to the surface type list in argument ...
// Appends the -1 terminator that _CheckSurfaceType() needs to stop reading its
// variadic list.
68 #define CHECK_SURFACE_TYPE( nType, ... )  ( _CheckSurfaceType( nType, __VA_ARGS__, -1 ) )
69 
// True when the argument's unitCount equals 1.
70 #define IsKernelArg(arg)    ((arg).unitCount == 1)
71 
72 // Warning : x must be uint32_t
// Packs (mem_ctrl << 8 | mem_type << 4 | age), truncated to 16 bits, into the
// upper 16 bits of x and ORs it with the current value of x.
73 #define SET_MEMORY_OBJECT_CONTROL(x, memCtl) \
74            x = ((uint16_t)(memCtl.mem_ctrl<< 8 | memCtl.mem_type << 4 | memCtl.age)) << 16 | (x);
75 
// Stores value at vmeIndexArray[vmeIndexArrayPosition] and advances the
// position. Both names must exist at the expansion site; expands to two
// statements.
76 #define   ADD_INTO_VME_INDEX_ARRAY(value)     \
77     vmeIndexArray[vmeIndexArrayPosition] = value ;                 \
78     vmeIndexArrayPosition ++;
79 
// Same as ADD_INTO_VME_INDEX_ARRAY but for vmeCmIndexArray.
// NOTE(review): the replacement list starts with a stray ';', so every
// expansion begins with an empty statement - confirm whether that is intended.
80 #define   ADD_INTO_VME_CM_INDEX_ARRAY(value)  ; \
81     vmeCmIndexArray[vmeCmIndexArrayPosition] = value ;                 \
82     vmeCmIndexArrayPosition ++;
83 
// Pointer-to-argument alias used by the sorting helpers below.
84 typedef CM_ARG* PCM_ARG;
85 
// Bit flags describing which parts of a kernel's cached data are out of date.
// The flags are combined with bitwise OR (e.g. m_dirty |= ...).
#define CM_KERNEL_DATA_CLEAN                   0         // kernel data clean
#define CM_KERNEL_DATA_KERNEL_ARG_DIRTY        1         // per kernel arg dirty
#define CM_KERNEL_DATA_THREAD_ARG_DIRTY        (1 << 1)  // per thread arg dirty
#define CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY      (1 << 2)  // indirect payload data dirty
#define CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY (1 << 3)  // indirect payload data size changes
#define CM_KERNEL_DATA_GLOBAL_SURFACE_DIRTY    (1 << 4)  // global surface dirty
#define CM_KERNEL_DATA_THREAD_COUNT_DIRTY      (1 << 5)  // thread count dirty, reset() be called
#define CM_KERNEL_DATA_SAMPLER_BTI_DIRTY       (1 << 6)  // sampler bti dirty
// Legacy spelling kept as an alias so existing uses keep compiling; prefer
// CM_KERNEL_DATA_SAMPLER_BTI_DIRTY, which follows the naming of its siblings.
#define cMKERNELDATASAMPLERBTIDIRTY            CM_KERNEL_DATA_SAMPLER_BTI_DIRTY
#define CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY      (1 << 7)       // threadgroupspace dirty

95 
Partition(PCM_ARG * args,int32_t p,int32_t r)96 int32_t Partition( PCM_ARG* args, int32_t p, int32_t r )
97 {
98     uint16_t x = args[p]->unitOffsetInPayload;
99     int32_t i = p - 1;
100     int32_t j = r + 1;
101     while( 1 )
102     {
103         do {
104             j --;
105         } while( args[j]->unitOffsetInPayload > x );
106 
107         do {
108             i ++;
109         } while( args[i]->unitOffsetInPayload < x );
110 
111         if( i < j )
112         {
113             PCM_ARG tmpP = args[i];
114             args[i] = args[j];
115             args[j] = tmpP;
116         }
117         else
118         {
119             return j;
120         }
121     }
122 }
123 
// Cannot be called directly! use macro CHECK_SURFACE_TYPE!
// Returns true when nType equals one of the int values in the variadic list.
// The list must be terminated by a negative value (the macro appends -1);
// scanning stops at the first negative entry, so a negative nType never matches.
bool _CheckSurfaceType( int nType, ... )
{
    va_list candidates;
    va_start( candidates, nType );

    bool found = false;
    for( int candidate = va_arg( candidates, int );
         candidate >= 0;
         candidate = va_arg( candidates, int ) )
    {
        if( candidate == nType )
        {
            found = true;
            break;
        }
    }

    va_end( candidates );
    return found;
}
144 
QuickSort(PCM_ARG * args,int32_t p,int32_t r)145 void QuickSort( PCM_ARG* args, int32_t p, int32_t r )
146 {
147     if( p < r )
148     {
149         int32_t q = Partition( args, p, r );
150         QuickSort( args, p, q );
151         QuickSort( args, q + 1, r );
152     }
153 }
154 
155 namespace CMRT_UMD
156 {
// Calls CmExtensionCreator<CmMovInstConstructor>::RegisterClass during static
// initialization of this translation unit and keeps the returned value in a
// file-local flag.
157 static bool bCmMovInstRegistered = CmExtensionCreator<CmMovInstConstructor>::RegisterClass<CmMovInstConstructor>();
158 //*-----------------------------------------------------------------------------
159 //| Purpose:   Create object for mov instructions
160 //|            instructions will be copied into DstMem
161 //*-----------------------------------------------------------------------------
ConstructObjMovs(uint32_t dstOffset,uint32_t srcOffset,uint32_t size,CmDynamicArray & movInsts,uint32_t index,bool isBdw,bool isHwDebug)162 uint32_t CmMovInstConstructor::ConstructObjMovs(uint32_t dstOffset, uint32_t srcOffset, uint32_t size, CmDynamicArray &movInsts, uint32_t index, bool isBdw, bool isHwDebug)
163 {
164     return MovInst_RT::CreateMoves(dstOffset, srcOffset, size, movInsts, index, isBdw, isHwDebug);
165 }
166 
167 //*-----------------------------------------------------------------------------
168 //| Purpose:     Create CM Kernel
169 //| Arguments :
170 //|               device        [in]    Pointer to device
171 //|               program      [in]    Pointer to cm Program
172 //|               kernelName    [in]    Name of kernel
173 //|               kernelId      [in]    Kernel's ID
174 //|               kernel       [in/out]    Reference Pointer to CM Kernel
175 //|               options       [in]    jitter, or non-jitter
176 //| Returns:    Result of the operation.
177 //*-----------------------------------------------------------------------------
Create(CmDeviceRT * device,CmProgramRT * program,const char * kernelName,uint32_t kernelIndex,uint32_t kernelSeqNum,CmKernelRT * & kernel,const char * options)178 int32_t CmKernelRT::Create(CmDeviceRT *device,
179                            CmProgramRT *program,
180                            const char *kernelName,
181                            uint32_t kernelIndex,
182                            uint32_t kernelSeqNum,
183                            CmKernelRT* &kernel,
184                            const char *options)
185 {
186     int32_t result = CM_SUCCESS;
187     CM_HAL_STATE * state  = device ? ((PCM_CONTEXT_DATA)device->GetAccelData())->cmHalState : nullptr;
188 
189     if (device)
190     {
191         if (state && state->advExecutor)
192         {
193             kernel = state->advExecutor->CreateKernelRT(device, program, kernelIndex, kernelSeqNum);
194         }
195         else
196         {
197             kernel = new (std::nothrow) CmKernelRT(device, program, kernelIndex, kernelSeqNum);
198         }
199     }
200 
201     if( kernel )
202     {
203         if (device)
204         {
205             device->m_memObjectCount.kernelCount++;
206         }
207         kernel->Acquire();
208         result = kernel->Initialize( kernelName, options );
209         if( result != CM_SUCCESS )
210         {
211             CmKernelRT::Destroy( kernel, program);
212             return result;
213         }
214     }
215     else
216     {
217         CM_ASSERTMESSAGE("Error: Failed to create CmKernel due to out of system memory.");
218         return CM_OUT_OF_HOST_MEMORY;
219     }
220     if (options)
221     {
222         if (strcmp(options, "PredefinedGPUCopyKernel") == 0)
223         {
224             kernel->m_blCreatingGPUCopyKernel = true;
225         }
226         else
227         {
228             kernel->m_blCreatingGPUCopyKernel = false;
229         }
230     }
231 
232 #if USE_EXTENSION_CODE
233     if (device)
234         result = kernel->InitForGTPin(device, program, kernel);
235 #endif
236 
237     return result;
238 }
239 
240 //*-----------------------------------------------------------------------------
241 //| Purpose:    Destory Kernel
242 //| Returns:    Result of the operation.
243 //*-----------------------------------------------------------------------------
Destroy(CmKernelRT * & kernel,CmProgramRT * & program)244 int32_t CmKernelRT::Destroy( CmKernelRT* &kernel, CmProgramRT *&program )
245 {
246     uint32_t refCount = kernel->SafeRelease();
247     if (refCount == 0)
248     {
249         kernel = nullptr;
250     }
251 
252     refCount = program->SafeRelease();
253     if (refCount == 0)
254     {
255         program = nullptr;
256     }
257     return CM_SUCCESS;
258 }
259 
260 //*-----------------------------------------------------------------------------
261 //| Purpose:    Acuqire Kernel: increment refcount
262 //| Returns:    Result of the operation.
263 //*-----------------------------------------------------------------------------
Acquire(void)264 int32_t CmKernelRT::Acquire( void)
265 {
266     m_refcount ++;
267     return m_refcount;
268 }
269 
270 //*-----------------------------------------------------------------------------
271 //| Purpose:    SafeRelease Kernel: Delete the instance
272 //| Returns:    Result of the operation.
273 //*-----------------------------------------------------------------------------
SafeRelease(void)274 int32_t CmKernelRT::SafeRelease( void)
275 {
276     --m_refcount;
277     if (m_refcount == 0)
278     {
279         m_device->m_memObjectCount.kernelCount--;
280         PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
281         PCM_HAL_STATE state = cmData->cmHalState;
282         if (state->dshEnabled)
283         {
284             state->pfnDSHUnregisterKernel(state, m_id);
285         }
286         delete this;
287         return 0;
288     }
289     return m_refcount;
290 }
291 
292 //*-----------------------------------------------------------------------------
293 //| Purpose:    Kernel constructor
294 //| Returns:    Result of the operation.
295 //*-----------------------------------------------------------------------------
CmKernelRT(CmDeviceRT * device,CmProgramRT * program,uint32_t kernelIndex,uint32_t kernelSeqNum)296 CmKernelRT::CmKernelRT(CmDeviceRT *device,
297                        CmProgramRT *program,
298                        uint32_t kernelIndex,
299                        uint32_t kernelSeqNum):
300     m_device( device ),
301     m_surfaceMgr( nullptr ),
302     m_program( program ),
303     m_options( nullptr ),
304     m_binary( nullptr ),
305     m_binaryOrig(nullptr),
306     m_binarySize(0),
307     m_binarySizeOrig(0),
308     m_threadCount( 0 ),
309     m_lastThreadCount( 0 ),
310     m_sizeInCurbe( 0 ),
311     m_sizeInPayload( 0 ),
312     m_argCount( 0 ),
313     m_args( nullptr ),
314     m_kernelInfo(nullptr),
315     m_kernelIndexInProgram( CM_INVALID_KERNEL_INDEX ),
316     m_curbeEnabled( true ),
317     m_nonstallingScoreboardEnabled(false),
318     m_dirty( CM_KERNEL_DATA_CLEAN ),
319     m_lastKernelData( nullptr ),
320     m_lastKernelDataSize( 0 ),
321     m_indexInTask(0),
322     m_threadSpaceAssociated(false),
323     m_perThreadArgExists(false),
324     m_perKernelArgExists( false ),
325     m_threadSpace( nullptr ),
326     m_adjustScoreboardY( 0 ),
327     m_lastAdjustScoreboardY( 0 ),
328     m_blCreatingGPUCopyKernel( false),
329     m_usKernelPayloadDataSize( 0 ),
330     m_kernelPayloadData( nullptr ),
331     m_usKernelPayloadSurfaceCount( 0 ),
332     m_samplerBtiCount( 0 ),
333     m_refcount(0),
334     m_halMaxValues( nullptr ),
335     m_halMaxValuesEx( nullptr ),
336     m_surfaceArray(nullptr),
337     m_threadGroupSpace( nullptr ),
338     m_vmeSurfaceCount( 0 ),
339     m_maxSurfaceIndexAllocated(0),
340     m_barrierMode(CM_LOCAL_BARRIER),
341     m_isClonedKernel(false),
342     m_cloneKernelID(0),
343     m_hasClones( false ),
344     m_stateBufferBounded( CM_STATE_BUFFER_NONE ),
345     m_movInstConstructor(nullptr)
346 {
347     program->Acquire();
348     m_program = program;
349 
350     device->GetSurfaceManager(m_surfaceMgr);
351 
352     m_id = kernelSeqNum; // Unique number for each kernel. This ID is used in Batch buffer.
353     m_id <<= 32;
354     m_kernelIndex = kernelIndex;
355 
356     for (int i = 0; i < CM_GLOBAL_SURFACE_NUMBER; i++)
357     {
358         m_globalSurfaces[i] = nullptr;
359         m_globalCmIndex[i] = 0;
360     }
361 
362     m_blhwDebugEnable = program->IsHwDebugEnabled();
363 
364     CmSafeMemSet(m_pKernelPayloadSurfaceArray, 0, sizeof(m_pKernelPayloadSurfaceArray));
365     CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, sizeof(m_IndirectSurfaceInfoArray));
366     CmSafeMemSet( m_samplerBtiEntry, 0, sizeof( m_samplerBtiEntry ) );
367 
368     if (m_samplerBtiCount > 0)
369     {
370         CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
371         m_samplerBtiCount = 0;
372     }
373 
374     ResetKernelSurfaces();
375 }
376 
377 //*-----------------------------------------------------------------------------
378 //| Purpose:    Destructor of Class CmKernel
379 //| Returns:    None.
380 //*-----------------------------------------------------------------------------
~CmKernelRT(void)381 CmKernelRT::~CmKernelRT( void )
382 {
383     MosSafeDeleteArray(m_options);
384 
385     DestroyArgs();
386 
387     if(m_lastKernelData)
388     {
389         CmKernelData::Destroy( m_lastKernelData );
390     }
391 
392     if( m_device->CheckGTPinEnabled() && !m_blCreatingGPUCopyKernel)
393     {
394         MosSafeDeleteArray(m_binary);
395     }
396 
397     if( CM_INVALID_KERNEL_INDEX != m_kernelIndexInProgram )
398     {
399         m_program->ReleaseKernelInfo(m_kernelIndexInProgram);
400     }
401 
402     for(int i=0; i< CM_GLOBAL_SURFACE_NUMBER; i++)
403     {
404         SurfaceIndex *surfIndex = m_globalSurfaces[i];
405         MosSafeDelete(surfIndex);
406     }
407 
408     MosSafeDeleteArray(m_kernelPayloadData);
409     MosSafeDeleteArray(m_surfaceArray);
410     MosSafeDelete(m_movInstConstructor);
411 }
412 
413 //*-----------------------------------------------------------------------------
414 //| Purpose:    Initialize CM kernel
415 //| Returns:    Result of the operation.
416 //*-----------------------------------------------------------------------------
Initialize(const char * kernelName,const char * options)417 int32_t CmKernelRT::Initialize( const char* kernelName, const char* options )
418 {
419     if( kernelName == nullptr )
420     {
421         CM_ASSERTMESSAGE("Error: Kernel name is null.");
422         return CM_NULL_POINTER;
423     }
424 
425     size_t length = strnlen( kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE );
426     if( length >= CM_MAX_KERNEL_NAME_SIZE_IN_BYTE  )
427     {
428         CM_ASSERTMESSAGE("Error: Kernel name size is too long.");
429         return CM_FAILURE;
430     }
431 
432     uint32_t kernelCount = 0;
433     m_program->GetKernelCount( kernelCount );
434 
435     CM_KERNEL_INFO* kernelInfo = nullptr;
436     uint32_t i = 0;
437     for( i = 0; i < kernelCount; i ++ )
438     {
439         m_program->GetKernelInfo( i, kernelInfo );
440         if( !kernelInfo )
441         {
442             CM_ASSERTMESSAGE("Error: Invalid kernel info.");
443             return CM_NULL_POINTER;
444         }
445         if( strcmp( kernelInfo->kernelName, kernelName ) == 0 )
446         {
447             break;
448         }
449     }
450 
451     if( i == kernelCount )
452     {
453         CM_ASSERTMESSAGE("Error: Invalid kernel count.");
454         return CM_FAILURE;
455     }
456 
457     m_device->GetHalMaxValues( m_halMaxValues, m_halMaxValuesEx);
458 
459     m_program->AcquireKernelInfo(i);
460     m_kernelInfo = kernelInfo;
461     m_kernelIndexInProgram = i;
462 
463     if( options )
464     {
465         size_t length = strnlen( options, CM_MAX_OPTION_SIZE_IN_BYTE );
466         if(length >= CM_MAX_OPTION_SIZE_IN_BYTE)
467         {
468             CM_ASSERTMESSAGE("Error: Option string is too long.");
469             return CM_INVALID_ARG_VALUE;
470         }
471         else
472         {
473             m_options = MOS_NewArray(char, (length+1));
474             if( !m_options )
475             {
476                 CM_ASSERTMESSAGE("Error: Out of system memory.");
477                 return CM_OUT_OF_HOST_MEMORY;
478 
479             }
480             CmSafeMemCopy( m_options, options, length);
481             m_options[ length ] = '\0';
482 
483             char* tmp = strstr( m_options, "nocurbe" );
484             if( tmp )
485             {
486                 m_curbeEnabled = false;
487             }
488         }
489     }
490 
491     m_nonstallingScoreboardEnabled = true;
492 
493     void* commonISACode = nullptr;
494     uint32_t commonISACodeSize = 0;
495     m_program->GetCommonISACode(commonISACode, commonISACodeSize);
496     if ((commonISACode == nullptr) || (commonISACodeSize <= 0))
497     {
498         CM_ASSERTMESSAGE("Error: Invalid VISA.");
499         return CM_INVALID_COMMON_ISA;
500     }
501 
502     bool useVisaApi = true;
503     vISA::ISAfile *isaFile = nullptr;
504     vISA::KernelBody *kernelBody = nullptr;
505 
506     auto getVersionAsInt = [](int major, int minor) { return major * 100 + minor; };
507     if (getVersionAsInt(m_program->m_cisaMajorVersion, m_program->m_cisaMinorVersion) < getVersionAsInt(3, 2))
508     {
509         useVisaApi = false;
510     }
511     else
512     {
513         isaFile = m_program->getISAfile();
514         if (!isaFile)
515         {
516             CM_ASSERTMESSAGE("Error: Invalid VISA.");
517             return CM_INVALID_COMMON_ISA;
518         }
519         kernelBody = isaFile->getKernelsData().at(m_kernelIndexInProgram);
520     }
521 
522     uint8_t *buf = (uint8_t*)commonISACode;
523     uint32_t bytePosition = m_kernelInfo->kernelIsaOffset;
524 
525     uint32_t kernelInfoRefCount = 0;
526     m_program->GetKernelInfoRefCount(m_kernelIndexInProgram, kernelInfoRefCount);
527     if (kernelInfoRefCount <= 2)    //Only read for 1st time Kernel creation, later we reuse them
528     {
529         if (useVisaApi)
530         {
531             m_kernelInfo->globalStringCount = kernelBody->getStringCount();
532         }
533         {
534             READ_FIELD_FROM_BUF(m_kernelInfo->globalStringCount, unsigned short);
535         }
536 
537         m_kernelInfo->globalStrings = (const char**) malloc( m_kernelInfo->globalStringCount * sizeof(char*) );
538         if(m_kernelInfo->globalStrings  == nullptr)
539         {
540             CM_ASSERTMESSAGE("Error: Out of system memory.");
541             return CM_OUT_OF_HOST_MEMORY;
542         }
543         CmSafeMemSet(m_kernelInfo->globalStrings, 0, m_kernelInfo->globalStringCount * sizeof(char*) );
544 
545         if (useVisaApi)
546         {
547             int i = 0;
548             for (vISA::StringPool *globalString : kernelBody->getStringPool())
549             {
550                 size_t stringLength = std::strlen(globalString->getString());
551                 char *string = (char*)malloc(stringLength + 1);
552                 if (string == nullptr)
553                 {
554                     CM_ASSERTMESSAGE("Error: Out of system memory.");
555                     return CM_OUT_OF_HOST_MEMORY;
556                 }
557                 CmSafeMemCopy(string, globalString->getString(), stringLength);
558                 string[stringLength] = '\0';
559                 m_kernelInfo->globalStrings[i] = string;
560                 i++;
561             }
562         }
563         else
564         {
565             for (int i = 0; i < (int)m_kernelInfo->globalStringCount; i++)
566             {
567                 char* string = (char*)malloc(CM_MAX_KERNEL_STRING_IN_BYTE + 1);
568                 if (string == nullptr)
569                 {
570                     CM_ASSERTMESSAGE("Error: Out of system memory.");
571                     return CM_OUT_OF_HOST_MEMORY;
572                 }
573                 int j = 0;
574                 while (buf[bytePosition] != '\0' && j < CM_MAX_KERNEL_STRING_IN_BYTE) {
575                     string[j++] = buf[bytePosition++];
576                 }
577                 string[j] = '\0';
578                 bytePosition++;
579                 m_kernelInfo->globalStrings[i] = string;
580             }
581         }
582     }
583 
584     uint32_t count = 0;
585     if (useVisaApi)
586     {
587         count = kernelBody->getNumInputs();
588     }
589     else
590     {
591         bytePosition = m_kernelInfo->inputCountOffset;
592 
593         uint8_t countTemp = 0;
594         READ_FIELD_FROM_BUF(countTemp, uint8_t);
595         count = countTemp;
596     }
597 
598     if( count > m_halMaxValues->maxArgsPerKernel )
599     {
600         CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
601         return CM_EXCEED_KERNEL_ARG_AMOUNT;
602     }
603 
604     m_args = MOS_NewArray(CM_ARG, count);
605     if( (!m_args) && (count != 0) )
606     {
607         CM_ASSERTMESSAGE("Error: Out of system memory.");
608         MosSafeDeleteArray(m_options);
609         return CM_OUT_OF_HOST_MEMORY;
610     }
611     CmSafeMemSet(m_args, 0, sizeof(CM_ARG) * count);
612     m_argCount  = count;
613 
614     uint32_t preDefinedSurfNum;
615     if ( (m_program->m_cisaMajorVersion > 3) || ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion >=1)) )  //CISA 3.1 +
616     {
617         preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_3_1;
618     }
619     else if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion == 0))
620     {
621         preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_2_1;
622     }
623     else //CISA 2.0
624     {
625         preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_2;
626     }
627 
628     uint32_t argSize = 0;
629 
630     for (uint32_t i = 0; i < m_argCount; i++)
631     {
632         vISA::InputInfo *inputInfo = nullptr;
633         uint8_t kind = 0;
634 
635         if (useVisaApi)
636         {
637             inputInfo = kernelBody->getInputInfo()[i];
638             kind = inputInfo->getKind();
639         }
640         else
641         {
642             READ_FIELD_FROM_BUF(kind, uint8_t);
643         }
644 
645         if (kind == 0x2) // compiler value for surface
646         {
647             kind = ARG_KIND_SURFACE;
648                 // runtime value for surface. surface will be further classified to 1D/2D/3D
649         }
650         else if (kind == 0x3) // compiler value for vme index
651         {
652             kind = ARG_KIND_VME_INDEX;
653         }
654         else if (kind == 0x8)
655         {
656             kind = ARG_KIND_IMPLICT_LOCALSIZE;
657             m_args[i].isSet = true;
658             m_args[i].unitCount = 1;
659         }
660         else if (kind == 0x10) {
661             kind = ARG_KIND_IMPLICT_GROUPSIZE;
662             m_args[i].isSet = true;
663             m_args[i].unitCount = 1;
664         }
665         else if (kind == 0x18) {
666             kind = ARG_KIND_IMPLICIT_LOCALID;
667             m_args[i].isSet = true;
668             m_args[i].unitCount = 1;
669             m_perKernelArgExists = true;  //only VISA3.3+, can come here, so, no matter it is there any explicit arg, implicit arg exits
670         }
671         else if (kind == 0x2A) {
672             kind = ARG_KIND_SURFACE_2D_SCOREBOARD;
673         }
674         else if (kind == 0x20) {
675             kind = ARG_KIND_GENERAL_DEPVEC;
676         }
677         else if (kind == 0x30) {
678             kind = ARG_KIND_GENERAL_DEPCNT;
679         }
680         else if (kind == 0x80) {
681             // IMP_PSEUDO_INPUT = 0x80 is pseudo input. All inputs after this
682             // will be ignored by CMRT without checking and payload copied.
683             // This resizes the argument count to achieve this.
684             m_argCount = i;
685             break;
686         }
687 
688         m_args[i].unitKind = kind;
689         m_args[i].unitKindOrig = kind;
690 
691         if (kind == ARG_KIND_SURFACE && m_kernelInfo->surfaceCount)
692         {
693             m_args[i].surfaceKind = DATA_PORT_SURF;
694         }
695 
696         if (useVisaApi)
697         {
698             m_args[i].unitOffsetInPayload = inputInfo->getOffset();
699             m_args[i].unitOffsetInPayloadOrig = inputInfo->getOffset();
700 
701             m_args[i].unitSize = inputInfo->getSize();
702             m_args[i].unitSizeOrig = inputInfo->getSize();
703         }
704         else
705         {
706             uint32_t varID;
707             READ_FIELD_FROM_BUF(varID, uint16_t);
708 
709             uint16_t tmpW;
710             READ_FIELD_FROM_BUF(tmpW, uint16_t);
711             m_args[i].unitOffsetInPayload = tmpW;
712             m_args[i].unitOffsetInPayloadOrig = tmpW;
713 
714             READ_FIELD_FROM_BUF(tmpW, uint16_t);
715             m_args[i].unitSize = tmpW;
716             m_args[i].unitSizeOrig = tmpW;
717         }
718 
719         argSize += m_args[i].unitSize;
720     }
721     //////////////////////////////////////////////////////////////////////////
722 
723     if (kernelInfoRefCount <= 2)    //Only read for 1st time Kernel creation, later we reuse them
724     {
725         uint16_t attributeCount = 0;
726         if (useVisaApi)
727         {
728             attributeCount = kernelBody->getAttributeCount();
729         }
730         else
731         {
732             /////////////////////////////////////////////////////////////////////////
733             // Get pre-defined kernel attributes, Start
734             //skipping size and entry
735             bytePosition += 8;
736 
737             READ_FIELD_FROM_BUF(attributeCount, uint16_t);
738         }
739 
740         for (int i = 0; i < attributeCount; i++)
741         {
742             vISA::AttributeInfo *attribute = nullptr;
743             uint32_t nameIndex = 0;
744             uint8_t size = 0;
745 
746             if (useVisaApi)
747             {
748                 attribute = kernelBody->getAttributeInfo()[i];
749                 nameIndex = attribute->getName();
750                 size = attribute->getSize();
751             }
752             else
753             {
754                 READ_FIELD_FROM_BUF(nameIndex, uint16_t);
755                 READ_FIELD_FROM_BUF(size, uint8_t);
756             }
757 
758             if( strcmp( m_kernelInfo->globalStrings[nameIndex], "AsmName" ) == 0 )
759             {
760                 if (useVisaApi)
761                 {
762                     CmSafeMemCopy(m_kernelInfo->kernelASMName, attribute->getValue(), size);
763                 }
764                 else
765                 {
766                     CmSafeMemCopy(m_kernelInfo->kernelASMName, &buf[bytePosition], size);
767                     bytePosition += size;
768                 }
769             }
770             else if (strcmp( m_kernelInfo->globalStrings[nameIndex], "SLMSize" ) == 0)
771             {
772                 if (useVisaApi)
773                 {
774                     m_kernelInfo->kernelSLMSize = attribute->getValue()[0];
775                 }
776                 else
777                 {
778                     READ_FIELD_FROM_BUF(m_kernelInfo->kernelSLMSize, uint8_t);
779                 }
780 
781                 /* Notes by Stony@2014-04-09
782                  * <=CISA3.1: the size is number of 4KB
783                  * > CISA3.1: the size is number of 1KB
784                  * Here convert it to the number of 1KB if <=CISA 3.1
785                  */
786                 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion <= 1))
787                 {
788                     m_kernelInfo->kernelSLMSize = m_kernelInfo->kernelSLMSize * 4;
789                 }
790 
791                 // align to power of 2
792                 uint32_t v = m_kernelInfo->kernelSLMSize;
793                 v--;
794                 v |= v >> 1;
795                 v |= v >> 2;
796                 v |= v >> 4;
797                 v |= v >> 8;
798                 v |= v >> 16;
799                 v++;
800                 m_kernelInfo->kernelSLMSize = ( uint8_t )v;
801             }
802             else if (strcmp(m_kernelInfo->globalStrings[nameIndex], "NoBarrier") == 0)
803             {
804                 m_kernelInfo->blNoBarrier = true;
805                 if (!useVisaApi)
806                 {
807                     bytePosition += size;
808                 }
809             }
810             else
811             {
812                 if (!useVisaApi)
813                 {
814                     bytePosition += size;
815                 }
816             }
817         }
818         if (m_kernelInfo->blNoBarrier && m_options && strstr(m_options, "-hasBarrier"))
819         {
820             m_kernelInfo->blNoBarrier = false;
821         }
822     }
823 
824     if(argSize > m_halMaxValues->maxArgByteSizePerKernel)
825     {
826         CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
827         return CM_EXCEED_KERNEL_ARG_SIZE_IN_BYTE;
828     }
829 
830     buf = (uint8_t*)commonISACode;
831 
832     if(m_program->IsJitterEnabled())
833     {
834         //m_JitterEnable = true;
835         char *programOptions;
836         m_program->GetKernelOptions(programOptions);
837         //if no options or same options, copy load program's binary. else re-jitter
838         {
839             m_binary = (char *)m_kernelInfo->jitBinaryCode;
840             m_binarySize = m_kernelInfo->jitBinarySize;
841             m_kernelInfo->origBinary = m_kernelInfo->jitBinaryCode;
842             m_kernelInfo->origBinarySize = m_kernelInfo->jitBinarySize;
843         }
844     }
845     else
846     {
847         char* binary = (char*)(buf + m_kernelInfo->genxBinaryOffset );
848 
849         //No copy, point to the binary offset in CISA code.
850         m_binary = binary;
851         m_binarySize = m_kernelInfo->genxBinarySize;
852 
853         m_kernelInfo->origBinary = binary;
854         m_kernelInfo->origBinarySize = m_kernelInfo->genxBinarySize;
855     }
856 
857     if (m_kernelInfo->blNoBarrier)
858     {
859         m_barrierMode = CM_NO_BARRIER;
860     }
861 
862     m_movInstConstructor = CmExtensionCreator<CmMovInstConstructor>::CreateClass();
863     if (m_movInstConstructor == nullptr)
864     {
865         CM_ASSERTMESSAGE("Error: Failed to allocate movInstConstructor due to out of system memory.");
866         return CM_OUT_OF_HOST_MEMORY;
867     }
868 
869     CmNotifierGroup *notifiers = m_device->GetNotifiers();
870     if (notifiers != nullptr)
871     {
872         notifiers->NotifyKernelCreated(this);
873     }
874 
875     return CM_SUCCESS;
876 }
877 
878 //*-----------------------------------------------------------------------------
879 //! A CmKernel can run in multiple threads concurrently. This
880 //! fucntion is to set the number of threads.
881 //! INPUT:
882 //!     number of threads
883 //! OUTPUT:
884 //!     CM_SUCCESS or
885 //!     CM_INVALID_ARG_VALUE if the number is larger than CmKernel's capacity
886 //*-----------------------------------------------------------------------------
SetThreadCount(uint32_t count)887 CM_RT_API int32_t CmKernelRT::SetThreadCount(uint32_t count )
888 {
889     INSERT_API_CALL_LOG(GetHalState());
890     // Check per kernel, per task check will be at enqueue time
891     if ((int)count <= 0)
892         return CM_INVALID_ARG_VALUE;
893 
894     if (m_threadSpace == nullptr)
895     {
896         if (m_threadCount)
897         {
898             // Setting threadCount twice with different values will cause reset of kernels
899             if (m_threadCount != count)
900             {
901                 Reset();
902                 m_threadCount = count;
903                 m_dirty |= CM_KERNEL_DATA_THREAD_COUNT_DIRTY;
904             }
905         }
906         else // first time
907         {
908             m_threadCount = count;
909         }
910     }
911     return CM_SUCCESS;
912 }
913 
GetThreadCount(uint32_t & count)914 int32_t CmKernelRT::GetThreadCount(uint32_t& count )
915 {
916     count = m_threadCount;
917     return CM_SUCCESS;
918 }
919 
GetKernelSurfaces(bool * & surfArray)920 int32_t CmKernelRT::GetKernelSurfaces(bool  *&surfArray)
921 {
922     surfArray = m_surfaceArray;
923     return CM_SUCCESS;
924 }
925 
ResetKernelSurfaces()926 int32_t CmKernelRT::ResetKernelSurfaces()
927 {
928     uint32_t surfacePoolSize = m_surfaceMgr->GetSurfacePoolSize();
929     if (!m_surfaceArray)
930     {
931         m_surfaceArray = MOS_NewArray(bool, surfacePoolSize);
932         if (!m_surfaceArray)
933         {
934             CM_ASSERTMESSAGE("Error: Failed to rest kernel surfaces due to out of system memory.");
935             return CM_OUT_OF_HOST_MEMORY;
936         }
937     }
938     CmSafeMemSet( m_surfaceArray, 0, surfacePoolSize * sizeof( bool ) );
939 
940     return CM_SUCCESS;
941 }
942 
943 //*-----------------------------------------------------------------------------
944 //| Purpose:    Get CmSurface from surface manager.
945 //|             Use "value + indexSurfaceArray" to locate its surfaceIndex
946 //| Returns:    CmSurface. Null if not found
947 //*-----------------------------------------------------------------------------
GetSurfaceFromSurfaceArray(SurfaceIndex * value,uint32_t indexSurfaceArray)948 CmSurface* CmKernelRT::GetSurfaceFromSurfaceArray( SurfaceIndex* value, uint32_t indexSurfaceArray)
949 {
950     int32_t hr                          = CM_SUCCESS;
951     CmSurface *surface           = nullptr;
952     SurfaceIndex* surfaceIndex     = nullptr;
953 
954     surfaceIndex = value + indexSurfaceArray;
955     CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceIndex);
956 
957     if (surfaceIndex->get_data() == CM_NULL_SURFACE
958         || surfaceIndex->get_data() == 0)
959     {
960         surface = (CmSurface *)CM_NULL_SURFACE;
961         goto finish;
962     }
963 
964     m_surfaceMgr->GetSurface(surfaceIndex->get_data(), surface);
965 
966 finish:
967     return surface;
968 }
969 
970 //*-----------------------------------------------------------------------------
971 //| Purpose:    Set kernel arg for single vme surface or multiple vme surfaces
972 //|             in surface array. So far, don't support vme surface array in thread arg.
973 //| Returns:    Result of the operation.
974 //*-----------------------------------------------------------------------------
SetArgsVme(CM_KERNEL_INTERNAL_ARG_TYPE nArgType,uint32_t argIndex,const void * value,uint32_t nThreadID)975 int32_t CmKernelRT::SetArgsVme(CM_KERNEL_INTERNAL_ARG_TYPE nArgType, uint32_t argIndex, const void *value, uint32_t nThreadID)
976 {
977     uint32_t elementNum = 0;
978     CM_ARG& arg        = m_args[ argIndex ];
979     uint32_t totalVmeArgValueSize       = 0;
980     uint32_t totalSurfacesInVme         = 0;
981     uint32_t tempVmeArgValueSize        = 0;
982     uint32_t vmeArgValueOffset          = 0;
983     uint32_t lastVmeSurfCount           = 0;
984     CmSurfaceVme* surfVme          = nullptr;
985     uint8_t *vmeArgValueArray         = nullptr;
986     uint16_t *vmeCmIndexArray          = nullptr;
987     int32_t hr = CM_SUCCESS;
988 
989     //Get Number of elements in surface array
990     if (arg.unitVmeArraySize == 0)
991     {  //First Time
992         elementNum = arg.unitSize / sizeof(uint32_t);
993     }
994     else
995     {
996         elementNum = arg.unitVmeArraySize;
997     }
998 
999     //Get Size of vmeIndexArray and vmeCmIndexArray.
1000     for(uint32_t i=0; i< elementNum; i++)
1001     {
1002         if (((SurfaceIndex*)(value)+i)->get_data() == 0 || ((SurfaceIndex*)(value)+i)->get_data() == CM_NULL_SURFACE)
1003         {
1004             tempVmeArgValueSize = sizeof(CM_HAL_VME_ARG_VALUE);
1005             totalVmeArgValueSize += tempVmeArgValueSize;
1006             totalSurfacesInVme++;
1007         }
1008         else
1009         {
1010             surfVme = static_cast<CmSurfaceVme*>(GetSurfaceFromSurfaceArray((SurfaceIndex*)value, i));
1011             CM_CHK_NULL_GOTOFINISH_CMERROR(surfVme);
1012             tempVmeArgValueSize = surfVme->GetVmeCmArgSize();
1013             totalVmeArgValueSize += tempVmeArgValueSize;
1014             totalSurfacesInVme += surfVme->GetTotalSurfacesCount();
1015         }
1016     }
1017 
1018     // Allocate and Zero Memory for arg.pValue and arg.surfIndex
1019     // arg.pValue    : an array of CM_HAL_VME_ARG_VALUE structure followed by an array of reference surfaces
1020     // arg.surfIndex : an array listing all the Cm surface indexes, in the order of current, fw surfaces, bw surfaces
1021 
1022     if (arg.unitSize < totalVmeArgValueSize) // need to re-allocate larger area)
1023     {
1024         if (arg.value)
1025         {
1026             MosSafeDeleteArray(arg.value);
1027         }
1028         arg.value = MOS_NewArray(uint8_t, totalVmeArgValueSize);
1029 
1030         if (arg.surfIndex)
1031         {
1032             MosSafeDeleteArray(arg.surfIndex);
1033         }
1034         arg.surfIndex = MOS_NewArray(uint16_t, totalSurfacesInVme);
1035     }
1036 
1037     CM_CHK_NULL_GOTOFINISH_CMERROR(arg.value);
1038     CmSafeMemSet(arg.value, 0, totalVmeArgValueSize);
1039     CM_CHK_NULL_GOTOFINISH_CMERROR(arg.surfIndex);
1040     CmSafeMemSet(arg.surfIndex, 0, totalSurfacesInVme * sizeof(uint16_t));
1041 
1042     //Set each Vme Surface
1043     for (uint32_t i = 0; i< elementNum; i++)
1044     {
1045         if (((SurfaceIndex*)(value)+i)->get_data() == 0 || ((SurfaceIndex*)(value)+i)->get_data() == CM_NULL_SURFACE)
1046         {
1047             PCM_HAL_VME_ARG_VALUE vmeArg = (PCM_HAL_VME_ARG_VALUE)(arg.value + vmeArgValueOffset);
1048             vmeArg->fwRefNum = 0;
1049             vmeArg->bwRefNum = 0;
1050             vmeArg->curSurface = CM_NULL_SURFACE;
1051             tempVmeArgValueSize = sizeof(CM_HAL_VME_ARG_VALUE);
1052             vmeArgValueOffset += tempVmeArgValueSize;
1053             arg.surfIndex[lastVmeSurfCount] = CM_NULL_SURFACE;
1054             lastVmeSurfCount++;
1055         }
1056         else
1057         {
1058             surfVme = static_cast<CmSurfaceVme*>(GetSurfaceFromSurfaceArray((SurfaceIndex*)value, i));
1059             CM_CHK_NULL_GOTOFINISH_CMERROR(surfVme);
1060             SetArgsSingleVme(surfVme, arg.value + vmeArgValueOffset, arg.surfIndex + lastVmeSurfCount);
1061             tempVmeArgValueSize = surfVme->GetVmeCmArgSize();
1062             vmeArgValueOffset += tempVmeArgValueSize;
1063             lastVmeSurfCount += surfVme->GetTotalSurfacesCount();
1064         }
1065     }
1066 
1067     if ( nArgType == CM_KERNEL_INTERNEL_ARG_PERKERNEL ) // per kernel arg
1068     {
1069         // First time set
1070         if( !arg.value )
1071         {   // Increment size kernel arguments will take up in CURBE
1072             m_sizeInCurbe += CM_ARGUMENT_SURFACE_SIZE * elementNum;
1073         }
1074 
1075         arg.unitCount = 1;
1076         arg.isDirty  = true;
1077         arg.isSet    = true;
1078         arg.unitKind  = ARG_KIND_SURFACE_VME;
1079         arg.unitSize = (uint16_t)totalVmeArgValueSize; // the unitSize can't represent surfaces count here
1080         arg.unitVmeArraySize = elementNum;
1081 
1082         m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1083         m_perKernelArgExists = true;
1084     }
1085     else
1086     {
1087         // Thread arg doesn't support VME surfaces as it is rarely used and it is complex to implement,
1088         // since each thread may has different surface number in its vme surface argment.
1089         hr = CM_THREAD_ARG_NOT_ALLOWED;
1090     }
1091 
1092 finish:
1093     if(hr != CM_SUCCESS)
1094     {
1095         MosSafeDeleteArray(arg.value);
1096         MosSafeDeleteArray(arg.surfIndex);
1097     }
1098     return hr;
1099 
1100 }
1101 
1102 //*-----------------------------------------------------------------------------
1103 //| Purpose:    Fill arg for a single vme surface.
1104 //|             vmeIndexArray points to arg.pValue
1105 //|             vmeCmIndexArray points to arg.surfIndex
1106 //| Returns:    Result of the operation.
1107 //*-----------------------------------------------------------------------------
SetArgsSingleVme(CmSurfaceVme * vmeSurface,uint8_t * vmeArgValueArray,uint16_t * cmSufacesArray)1108 int32_t CmKernelRT::SetArgsSingleVme(CmSurfaceVme* vmeSurface, uint8_t *vmeArgValueArray, uint16_t *cmSufacesArray)
1109 {
1110 
1111     int32_t hr = CM_SUCCESS;
1112     CM_SURFACE_MEM_OBJ_CTRL memCtl;
1113     uint32_t vmeBackwardSurfaceCount        = 0;
1114     uint32_t vmeForwardSurfaceCount         = 0;
1115     uint32_t vmeCurrentSurfaceIndex         = 0;
1116     uint16_t vmeCurrentCmIndex              = 0;
1117     int32_t vmeIndexArrayPosition          = 0; // Offset for vmeIndexArray
1118     int32_t vmeCmIndexArrayPosition        = 0; // Offset for vmeCmIndexArray
1119     uint32_t tempOutput                     = 0;
1120     uint32_t cmSurfArrayIdx                 = 0;
1121     uint32_t surfStateWidth                 = 0;
1122     uint32_t surfStateHeight                = 0;
1123 
1124     uint32_t *fArray       = nullptr;
1125     uint32_t *bArray       = nullptr;
1126     uint32_t *fCmIndex     = nullptr;
1127     uint32_t *bCmIndex     = nullptr;
1128 
1129     uint32_t *fwSurfInArg = nullptr;
1130     uint32_t *bwSurfInArg = nullptr;
1131 
1132     CmSurface *surface = nullptr;
1133     PCM_HAL_VME_ARG_VALUE vmeArg = (PCM_HAL_VME_ARG_VALUE)vmeArgValueArray;
1134 
1135     CM_CHK_NULL_GOTOFINISH_CMERROR(vmeSurface);
1136     CM_CHK_NULL_GOTOFINISH_CMERROR(vmeArg);
1137     CM_CHK_NULL_GOTOFINISH_CMERROR(cmSufacesArray);
1138 
1139     if(vmeSurface == (CmSurfaceVme *)CM_NULL_SURFACE)
1140     {
1141         vmeArg->fwRefNum = 0;
1142         vmeArg->bwRefNum = 0;
1143         vmeArg->curSurface = CM_NULL_SURFACE;
1144         cmSufacesArray[cmSurfArrayIdx] =  CM_NULL_SURFACE;
1145         return hr;
1146     }
1147 
1148     // Get Vme Backward Forward Surface Count
1149     vmeSurface->GetIndexBackwardCount(vmeBackwardSurfaceCount);
1150     vmeSurface->GetIndexForwardCount(vmeForwardSurfaceCount);
1151 
1152     vmeArg->fwRefNum = vmeForwardSurfaceCount;
1153     vmeArg->bwRefNum = vmeBackwardSurfaceCount; // these two numbers must be set before any other operations
1154 
1155     vmeSurface->GetSurfaceStateResolution(vmeArg->surfStateParam.surfaceStateWidth, vmeArg->surfStateParam.surfaceStateHeight);
1156 
1157     vmeSurface->GetIndexForwardArray(fArray);
1158     vmeSurface->GetIndexBackwardArray(bArray);
1159     vmeSurface->GetIndexCurrent(vmeCurrentSurfaceIndex);
1160 
1161     vmeSurface->GetCmIndexCurrent(vmeCurrentCmIndex);
1162     vmeSurface->GetCmIndexForwardArray(fCmIndex);
1163     vmeSurface->GetCmIndexBackwardArray(bCmIndex);
1164 
1165     cmSufacesArray[cmSurfArrayIdx++] = vmeCurrentCmIndex;
1166 
1167     // Set Current Vme Surface
1168     m_surfaceMgr->GetSurface(vmeCurrentCmIndex, surface);
1169     CM_CHK_NULL_GOTOFINISH_CMERROR(surface);
1170 
1171     vmeArg->curSurface = vmeCurrentSurfaceIndex;
1172 
1173     //Set Forward Vme Surfaces
1174     fwSurfInArg = findFwRefInVmeArg(vmeArg);
1175     for (uint32_t i = 0; i < vmeForwardSurfaceCount; i++)
1176     {
1177         GetVmeSurfaceIndex( fArray, fCmIndex, i, &tempOutput);
1178         fwSurfInArg[i] = tempOutput;
1179         cmSufacesArray[cmSurfArrayIdx++] = (uint16_t)fCmIndex[i];
1180     }
1181 
1182     //Set Backward Vme Surfaces
1183     bwSurfInArg = findBwRefInVmeArg(vmeArg);
1184     for (uint32_t i = 0; i < vmeBackwardSurfaceCount; i++)
1185     {
1186         GetVmeSurfaceIndex( bArray, bCmIndex, i, &tempOutput);
1187         bwSurfInArg[i] = tempOutput;
1188         cmSufacesArray[cmSurfArrayIdx++] = (uint16_t)bCmIndex[i];
1189     }
1190 
1191 finish:
1192     return hr;
1193 }
1194 
1195 //*-----------------------------------------------------------------------------
1196 //| Purpose:    Get Vme Surface Index with memory object setting .
1197 //|             Output value will be filled into arg.pValue
1198 //| Returns:    Result of the operation.
1199 //*-----------------------------------------------------------------------------
GetVmeSurfaceIndex(uint32_t * vmeIndexArray,uint32_t * vmeCmIndexArray,uint32_t index,uint32_t * outputValue)1200 int32_t CmKernelRT::GetVmeSurfaceIndex(
1201     uint32_t *vmeIndexArray,
1202     uint32_t *vmeCmIndexArray,
1203     uint32_t index,
1204     uint32_t *outputValue)
1205 {
1206     int32_t hr = CM_SUCCESS;
1207     uint32_t value = vmeIndexArray[index];
1208 
1209     if (vmeIndexArray[index] == CM_INVALID_VME_SURFACE)
1210     {
1211         value = CM_NULL_SURFACE;
1212     }
1213 
1214     *outputValue = value;
1215 
1216     return hr;
1217 }
1218 
1219 //*-----------------------------------------------------------------------------
1220 //| Purpose:    Set arguments for function SetKernelArg().
1221 //|             Kernel argument is surface array.
1222 //! INPUT:
1223 //!             1) Current index in surface array
1224 //!             2) Index of kernel argument
1225 //!             3) Surface count in surface array
1226 //!             4) Pointer to current surface in surface array.
1227 //!             5) Current surface  index
1228 //!             6) Pointer to argument value
1229 //!             7) value of surface handle combined with memory object control
1230 //!             8) Original surface index for each surface in array
1231 //| Returns:    Result of the operation.
1232 //*-----------------------------------------------------------------------------
SetArgsInternalSurfArray(int32_t offset,uint32_t kernelArgIndex,int32_t surfCount,CmSurface * currentSurface,uint32_t currentSurfIndex,SurfaceIndex * value,uint32_t surfValue[],uint16_t origSurfIndex[])1233 int32_t CmKernelRT::SetArgsInternalSurfArray(
1234     int32_t offset,uint32_t kernelArgIndex,
1235     int32_t surfCount, CmSurface* currentSurface,
1236     uint32_t currentSurfIndex, SurfaceIndex* value,
1237     uint32_t surfValue[], uint16_t origSurfIndex[])
1238 {
1239     CM_SURFACE_MEM_OBJ_CTRL memCtl;
1240     uint32_t                surfRegTableIndex = 0;
1241     uint32_t                handle = 0;
1242     uint32_t                samplerIndex;
1243     uint16_t                samplerCmIndex;
1244     uint32_t                surfaceArraySize = 0;
1245 
1246     m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
1247     MosSafeDeleteArray(m_args[kernelArgIndex].surfArrayArg); // delete it if it was allcated
1248     m_args[kernelArgIndex].surfArrayArg = MOS_NewArray(SURFACE_ARRAY_ARG, surfCount);
1249     if (!m_args[kernelArgIndex].surfArrayArg)
1250     {
1251         CM_ASSERTMESSAGE("Error: Out of system memory.");
1252         return CM_OUT_OF_HOST_MEMORY;
1253     }
1254     CmSafeMemSet((void *)m_args[kernelArgIndex].surfArrayArg, 0,  sizeof(SURFACE_ARRAY_ARG) * surfCount);
1255     while (offset < surfCount)
1256     {
1257         switch (currentSurface->Type())
1258         {
1259           case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
1260           {
1261              CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(currentSurface);
1262 
1263              uint32_t numAliases = 0;
1264              surf2D->GetNumAliases(numAliases);
1265              if (numAliases)
1266              {
1267                  m_args[kernelArgIndex].aliasCreated = true;
1268              }
1269              else
1270              {
1271                  m_args[kernelArgIndex].aliasCreated = false;
1272              }
1273 
1274              //set memory object control
1275              surf2D->GetIndexFor2D(surfRegTableIndex);
1276 
1277              surfValue[offset] = surfRegTableIndex;
1278              origSurfIndex[offset] = (uint16_t)currentSurfIndex;
1279 
1280              m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_2D;
1281              m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_2D;
1282 
1283              break;
1284          }
1285          case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
1286          {
1287              CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(currentSurface);
1288 
1289              uint32_t numAliases = 0;
1290              surf1D->GetNumAliases(numAliases);
1291              if (numAliases)
1292              {
1293                  m_args[kernelArgIndex].aliasCreated = true;
1294              }
1295              else
1296              {
1297                  m_args[kernelArgIndex].aliasCreated = false;
1298              }
1299 
1300              //set memory object control
1301              surf1D->GetHandle(handle);
1302 
1303              surfValue[offset] = handle;
1304              origSurfIndex[offset] = (uint16_t)currentSurfIndex;
1305 
1306              m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_1D;
1307              m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_1D;
1308              break;
1309          }
1310          case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
1311          {
1312              CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(currentSurface);
1313 
1314              //set memory object
1315              surf2DUP->GetHandle(handle);
1316 
1317              surfValue[offset] = handle;
1318              origSurfIndex[offset] = (uint16_t)currentSurfIndex;
1319 
1320              m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_2D_UP;
1321              m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_2D_UP;
1322              break;
1323          }
1324          case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
1325          {
1326              CmSurface3DRT* surf3D = static_cast<CmSurface3DRT*>(currentSurface);
1327 
1328              surf3D->GetHandle(handle);
1329 
1330              surfValue[offset] = handle;
1331              origSurfIndex[offset] = (uint16_t)currentSurfIndex;
1332 
1333              m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_3D;
1334              m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_3D;
1335 
1336              break;
1337          }
1338 
1339          case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
1340          {
1341              CmStateBuffer* stateBuffer = static_cast< CmStateBuffer* >( currentSurface );
1342              stateBuffer->GetHandle( handle );
1343 
1344              surfValue[ offset ] = handle;
1345              origSurfIndex[ offset ] = ( uint16_t )currentSurfIndex;
1346 
1347              m_args[ kernelArgIndex ].surfArrayArg[ offset ].argKindForArray = ARG_KIND_STATE_BUFFER;
1348              m_args[ kernelArgIndex ].unitKind = ARG_KIND_STATE_BUFFER;
1349 
1350              break;
1351          }
1352 
1353          //sampler surface
1354          case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
1355          {
1356              CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (currentSurface);
1357              surfSampler->GetHandle(samplerIndex);
1358              surfSampler->GetCmIndexCurrent(samplerCmIndex);
1359 
1360              m_surfaceMgr->GetSurface(samplerCmIndex, currentSurface);
1361              if (!currentSurface)
1362              {
1363                  CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
1364                  return CM_NULL_POINTER;
1365              }
1366 
1367              surfValue[offset] = samplerIndex;
1368              origSurfIndex[offset] = (uint16_t)samplerCmIndex;
1369 
1370              SAMPLER_SURFACE_TYPE type;
1371              surfSampler->GetSurfaceType(type);
1372              if (type == SAMPLER_SURFACE_TYPE_2D)
1373              {
1374                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER;
1375                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER;
1376              }
1377              else if (type == SAMPLER_SURFACE_TYPE_2DUP)
1378              {
1379                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE2DUP_SAMPLER;
1380                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE2DUP_SAMPLER;
1381              }
1382              else if(type == SAMPLER_SURFACE_TYPE_3D)
1383              {
1384                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_3D;
1385                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_3D;
1386              }
1387              else
1388              {
1389                  CM_ASSERTMESSAGE("Error: Assign a Sampler surface to the arg which is previously 2D/3D surface.");
1390                  return CM_FAILURE;
1391              }
1392              break;
1393          }
1394          //sampler8x8surface
1395          case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
1396          {
1397              CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (currentSurface);
1398              surfSampler8x8->GetIndexCurrent(samplerIndex);
1399              surfSampler8x8->GetCmIndex(samplerCmIndex);
1400 
1401              m_surfaceMgr->GetSurface(samplerCmIndex, currentSurface);
1402              if (!currentSurface)
1403              {
1404                  CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
1405                  return CM_FAILURE;
1406              }
1407 
1408              surfValue[offset] = samplerIndex;
1409              origSurfIndex[offset] = (uint16_t)samplerCmIndex;
1410 
1411              CM_SAMPLER8x8_SURFACE type;
1412              type = surfSampler8x8->GetSampler8x8SurfaceType();
1413              if (type == CM_VA_SURFACE)
1414              {
1415                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
1416                  m_args[kernelArgIndex].surfArrayArg[offset].addressModeForArray = surfSampler8x8->GetAddressControlMode();
1417                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER8X8_VA;
1418              }
1419              else if(type == CM_AVS_SURFACE)
1420              {
1421                  m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
1422                  m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
1423              }
1424              else
1425              {
1426                  CM_ASSERTMESSAGE("Error: Assign a Sampler8x8 surface to the arg which is previously 2D surface.");
1427                  return CM_FAILURE;
1428              }
1429              break;
1430          }
1431          default:
1432          {
1433              CM_ASSERTMESSAGE("Error: No matched surface for surface array");
1434              return CM_INVALID_ARG_VALUE;
1435          }
1436        }
1437        offset++;
1438        if (offset < surfCount)
1439        {
1440            currentSurfIndex = value[offset].get_data();
1441 
1442            while ((!currentSurfIndex && (offset < surfCount))||(currentSurfIndex == CM_NULL_SURFACE))
1443            {
1444                surfValue[offset] = CM_NULL_SURFACE;
1445                origSurfIndex[offset] = 0;
1446                offset++;
1447                if (offset >= surfCount)
1448                    break;
1449                currentSurfIndex = value[offset].get_data();
1450            }
1451 
1452            if(surfaceArraySize == 0)
1453            {
1454                CM_ASSERTMESSAGE("Error: No surface in surface array");
1455                return CM_NO_AVAILABLE_SURFACE;
1456            }
1457            if (currentSurfIndex > surfaceArraySize)
1458            {
1459                currentSurfIndex = currentSurfIndex % surfaceArraySize;
1460            }
1461        }
1462        if (offset < surfCount)
1463        {
1464            m_surfaceMgr->GetSurface(currentSurfIndex, currentSurface);
1465            if (nullptr == currentSurface)
1466            {
1467                CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
1468                return CM_FAILURE;
1469            }
1470        }
1471     }
1472     return CM_SUCCESS;
1473 }
1474 //*-----------------------------------------------------------------------------
1475 // Set arguments for function SetKernelArg() and SetThreadArg()
1476 // Set parameter nArgType to CM_KERNEL_INTERNEL_ARG_KERNEL to set a kernel
1477 // argument; set parameter nArgType to CM_KERNEL_INTERNEL_ARG_THREAD to set
1478 // a thread argument
1479 //*-----------------------------------------------------------------------------
SetArgsInternal(CM_KERNEL_INTERNAL_ARG_TYPE nArgType,uint32_t index,size_t size,const void * value,uint32_t nThreadID)1480 int32_t CmKernelRT::SetArgsInternal( CM_KERNEL_INTERNAL_ARG_TYPE nArgType, uint32_t index, size_t size, const void *value, uint32_t nThreadID )
1481 {
1482     uint32_t surfRegTableIndex = 0; // for 2D surf
1483     uint32_t handle = 0; // for 1D surf
1484 
1485     uint32_t samplerIndex;
1486     uint16_t samplerCmIndex;
1487     uint32_t samplerIdx = 0;
1488     uint32_t vmeIdx = 0;
1489     uint16_t *surfIndexValue =  nullptr;
1490     uint32_t surfaces[CM_MAX_ARGS_PER_KERNEL];
1491     uint16_t surfIndexArray[CM_MAX_ARGS_PER_KERNEL];
1492     std::vector< int > sampler_index_array;
1493 
1494     //Clear "set" flag in case user call API to set the same one argument multiple times.
1495     m_args[index].isSet = false;
1496     if( m_args[ index ].unitKind == ARG_KIND_GENERAL || (m_args[index].unitKind == ARG_KIND_GENERAL_DEPVEC) || (m_args[index].unitKind == ARG_KIND_GENERAL_DEPCNT))
1497     {
1498         if( size != m_args[ index ].unitSize )
1499         {
1500             CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
1501             return CM_INVALID_ARG_SIZE;
1502         }
1503     }
1504     //For surface type
1505     else if (CHECK_SURFACE_TYPE(m_args[index].unitKind,
1506         ARG_KIND_SURFACE,
1507         ARG_KIND_SURFACE_1D,
1508         ARG_KIND_SURFACE_2D,
1509         ARG_KIND_SURFACE_2D_UP,
1510         ARG_KIND_SURFACE_3D,
1511         ARG_KIND_SURFACE_SAMPLER,
1512         ARG_KIND_SURFACE2DUP_SAMPLER,
1513         ARG_KIND_SURFACE_VME,
1514         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
1515         ARG_KIND_SURFACE_SAMPLER8X8_VA,
1516         ARG_KIND_SURFACE_2D_SCOREBOARD,
1517         ARG_KIND_STATE_BUFFER
1518         ))
1519     {
1520 
1521         // this code is to convert SurfaceIndex object to index of type uint32_t,
1522         // which is expected by commonISA/genBinary
1523         // index is the index of the surface in surface registration table of CM device
1524         // in driver
1525 
1526         int signatureSize = m_args[index].unitSize;
1527         int numSurfaces = signatureSize / sizeof(int);
1528         SurfaceIndex* surfIndex = (SurfaceIndex*)value;
1529         if (surfIndex == (SurfaceIndex*)CM_NULL_SURFACE)
1530         {
1531             m_args[index].isSet = true;
1532             m_args[index].unitCount = 1; // per kernel arg
1533             m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1534             m_perKernelArgExists = true;
1535             m_args[index].isDirty = true;
1536             m_args[index].isNull = true;
1537             return CM_SUCCESS;
1538         }
1539         else
1540         {
1541             // In case that CM_NULL_SURFACE was set at last time and will
1542             // set a read surface index this time. So need set isDirty as
1543             // well to indicate update kernel data.
1544             if (m_args[index].isNull == true)
1545             {
1546                 m_args[index].isDirty = true;
1547                 m_args[index].isNull = false;
1548             }
1549         }
1550 
1551         m_args[index].isNull = false;
1552         CM_SURFACE_MEM_OBJ_CTRL memCtl;
1553 
1554         if (m_args[index].unitKind != ARG_KIND_SURFACE_VME)
1555         {
1556             if (size != sizeof(SurfaceIndex)* numSurfaces)
1557             {
1558                 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
1559                 return CM_INVALID_ARG_SIZE;
1560             }
1561         }
1562 
1563         uint32_t surfIndexData = surfIndex->get_data();
1564         int i = 0;
1565         uint32_t surfaceArraySize = 0;
1566         m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
1567 
1568         if (surfIndexData > surfaceArraySize)
1569         {
1570             if (m_args[index].aliasIndex != surfIndexData)
1571             {
1572                 m_args[index].isDirty = true;
1573                 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1574                 m_args[index].aliasIndex = surfIndexData;
1575             }
1576 
1577             surfIndexData = surfIndexData % surfaceArraySize;
1578         }
1579         else
1580         {
1581             m_args[index].aliasIndex = 0;
1582         }
1583 
1584         while (!surfIndexData && (i < numSurfaces))
1585         {
1586             surfaces[i] = CM_NULL_SURFACE;
1587             surfIndexArray[i] = 0;
1588             i++;
1589             if (i >= numSurfaces)
1590                 break;
1591             surfIndexData = surfIndex[i].get_data();
1592         }
1593 
1594         if (i >= numSurfaces)
1595         {
1596             m_args[index].unitKind = ARG_KIND_SURFACE_2D;
1597             value = surfaces;
1598             size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1599             m_args[index].unitSize = (uint16_t)size;
1600             goto finish;
1601         }
1602         CmSurface* surface = nullptr;
1603         m_surfaceMgr->GetSurface(surfIndexData, surface);
1604         if (nullptr == surface)
1605         {
1606             CM_ASSERTMESSAGE("Error: Invalid surface.");
1607             return CM_FAILURE;
1608         }
1609 
1610         if (SurfTypeToArgKind(surface->Type()) != m_args[index].unitKind)
1611         {   // if surface type changes i.e 2D <-> 2DUP  Need to set bIsDrity as true
1612             m_args[index].isDirty = true;
1613             m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1614         }
1615 
1616         uint32_t cisaMajorVersion, cisaMinorVersion;
1617         m_program->GetCISAVersion(cisaMajorVersion, cisaMinorVersion);
1618 
1619         //This path is for surface array, including 1D, 2D, 3D,samplersurface, samplersurface8x8
1620         if ((numSurfaces > 1) && (surface->Type() != CM_ENUM_CLASS_TYPE_CMSURFACEVME))
1621         {
1622             int32_t hr = SetArgsInternalSurfArray(i,index, numSurfaces, surface, surfIndexData, surfIndex,surfaces, surfIndexArray);
1623             if (hr != CM_SUCCESS)
1624             {
1625                 CM_ASSERTMESSAGE("Error: SetArgsInternal for surface array failed!\n");
1626                 return CM_INVALID_ARG_VALUE;
1627             }
1628             value = surfaces;
1629             surfIndexValue = surfIndexArray;
1630             size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1631             m_args[index].unitSize = (uint16_t)size;
1632         }
1633         else
1634         {   //This is for single surface and surface array for VME surface
1635             switch (surface->Type())
1636             {
1637                  case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
1638                  {
1639                      CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(surface);
1640 
1641                      uint32_t numAliases = 0;
1642                      surf2D->GetNumAliases(numAliases);
1643                      if (numAliases)
1644                      {
1645                          m_args[index].aliasCreated = true;
1646                      }
1647                      else
1648                      {
1649                          m_args[index].aliasCreated = false;
1650                      }
1651 
1652                      //set memory object control
1653                      surf2D->GetIndexFor2D(surfRegTableIndex);
1654 
1655                      surfaces[i] = surfRegTableIndex;
1656                      surfIndexArray[i] = (uint16_t)surfIndexData;
1657 
1658                      value = surfaces;
1659                      surfIndexValue = surfIndexArray;
1660 
1661                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1662                      m_args[index].unitSize = (uint16_t)size;
1663 
1664                      if ((m_args[index].unitKind == ARG_KIND_SURFACE) || (m_args[index].unitKind == ARG_KIND_SURFACE_2D_UP)) // first time or last time is set to 2DUP
1665                      {
1666                          m_args[index].unitKind = ARG_KIND_SURFACE_2D;
1667                          if (m_args[index].surfaceKind == SAMPLER_SURF)
1668                              m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER;
1669                      }
1670                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_2D &&
1671                          m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER &&
1672                          m_args[index].unitKind != ARG_KIND_SURFACE2DUP_SAMPLER &&
1673                          m_args[index].unitKind != ARG_KIND_SURFACE_2D_SCOREBOARD)
1674                      {
1675                          CM_ASSERTMESSAGE("Error: Assign a 2D surface to the arg which is previously assigned 1D surface, 3D surface, or VME surface.");
1676                          return CM_INVALID_ARG_VALUE;
1677                      }
1678                      break;
1679                  }
1680                  case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
1681                  {
1682                      CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(surface);
1683 
1684                      uint32_t numAliases = 0;
1685                      surf1D->GetNumAliases(numAliases);
1686                      if (numAliases)
1687                      {
1688                          m_args[index].aliasCreated = true;
1689                      }
1690                      else
1691                      {
1692                          m_args[index].aliasCreated = false;
1693                      }
1694 
1695                      //set memory object control
1696                      surf1D->GetHandle(handle);
1697 
1698                      surfaces[i] = handle;
1699                      surfIndexArray[i] = (uint16_t)surfIndexData;
1700 
1701                      value = surfaces;
1702                      surfIndexValue = surfIndexArray;
1703 
1704                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1705                      m_args[index].unitSize = (uint16_t)size;
1706 
1707                      if (m_args[index].unitKind == ARG_KIND_SURFACE)
1708                      {
1709                          m_args[index].unitKind = ARG_KIND_SURFACE_1D;
1710                      }
1711                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_1D)
1712                      {
1713                          CM_ASSERTMESSAGE("Error: Assign a 1D surface to the arg which is previously assigned 2D surface, 3D surface, or VME surface.");
1714                          return CM_INVALID_ARG_VALUE;
1715                      }
1716                      break;
1717                  }
1718                  case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
1719                  {
1720                      CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(surface);
1721 
1722                      //set memory object
1723                      surf2DUP->GetHandle(handle);
1724 
1725                      surfaces[i] = handle;
1726                      surfIndexArray[i] = (uint16_t)surfIndexData;
1727 
1728                      value = surfaces;
1729                      surfIndexValue = surfIndexArray;
1730 
1731                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1732                      m_args[index].unitSize = (uint16_t)size;
1733 
1734                      if ((m_args[index].unitKind == ARG_KIND_SURFACE) || (m_args[index].unitKind == ARG_KIND_SURFACE_2D)) // first time or last time is set to 2D
1735                      {
1736                          m_args[index].unitKind = ARG_KIND_SURFACE_2D_UP;
1737                      }
1738                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_2D_UP)
1739                      {
1740                          CM_ASSERTMESSAGE("Error: Assign a 2D surface UP to the arg which is previously assigned other surfaces.");
1741                          return CM_INVALID_ARG_VALUE;
1742                      }
1743 
1744                      break;
1745                  }
1746                  case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
1747                  {
1748                      CmSurface3DRT* surf3D = static_cast<CmSurface3DRT*>(surface);
1749 
1750                      surf3D->GetHandle(handle);
1751 
1752                      surfaces[i] = handle;
1753                      surfIndexArray[i] = (uint16_t)surfIndexData;
1754 
1755                      value = surfaces;
1756                      surfIndexValue = surfIndexArray;
1757 
1758                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1759                      m_args[index].unitSize = (uint16_t)size;
1760 
1761                      if (m_args[index].unitKind == ARG_KIND_SURFACE) // first time
1762                      {
1763                          m_args[index].unitKind = ARG_KIND_SURFACE_3D;
1764                      }
1765                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_3D)
1766                      {
1767                          CM_ASSERTMESSAGE("Error: Assign a 3D surface to the arg which is previously assigned 1D surface, 2D surface or VME surface");
1768                          return CM_INVALID_ARG_VALUE;
1769                      }
1770                      break;
1771                  }
1772 
1773                  case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
1774                  {
1775                      CmStateBuffer* stateBuffer = static_cast< CmStateBuffer* >( surface );
1776                      stateBuffer->GetHandle( handle );
1777 
1778                      surfaces[ i ] = handle;
1779                      surfIndexArray[ i ] = ( uint16_t )surfIndexData;
1780 
1781                      value = surfaces;
1782                      surfIndexValue = surfIndexArray;
1783 
1784                      size = ( size / sizeof( SurfaceIndex ) ) * sizeof( uint32_t );
1785                      m_args[ index ].unitSize = ( uint16_t )size;
1786 
1787                      if ( m_args[ index ].unitKind == ARG_KIND_SURFACE ) // first time
1788                      {
1789                          m_args[ index ].unitKind = ARG_KIND_STATE_BUFFER;
1790                      }
1791                      else if ( m_args[ index ].unitKind != ARG_KIND_STATE_BUFFER )
1792                      {
1793                          CM_ASSERTMESSAGE( "Error: Assign a state buffer to the arg which is previously assigned 1D surface, 2D surface, 3D surface or VME surface" );
1794                          return CM_INVALID_ARG_VALUE;
1795                      }
1796                      break;
1797                  }
1798 
1799                  case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
1800                  {
1801                      return SetArgsVme(nArgType, index, value, nThreadID);
1802                  }
1803                  case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
1804                  {
1805                      CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (surface);
1806                      surfSampler8x8->GetIndexCurrent(samplerIndex);
1807                      surfSampler8x8->GetCmIndex(samplerCmIndex);
1808                      if (samplerCmIndex > surfaceArraySize)
1809                      {
1810                          m_args[index].aliasIndex = samplerCmIndex;
1811                          m_args[index].aliasCreated = true;
1812                          samplerCmIndex %= surfaceArraySize;
1813                      }
1814 
1815                      m_surfaceMgr->GetSurface(samplerCmIndex, surface);
1816                      if (!surface)
1817                      {
1818                          CM_ASSERTMESSAGE("Error: Invalid sampler8x8 surface.");
1819                          return CM_FAILURE;
1820                      }
1821 
1822                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1823                      m_args[index].unitSize = (uint16_t)size;
1824 
1825                      value = &samplerIndex;
1826                      surfIndexValue = &samplerCmIndex;
1827 
1828                      if (m_args[index].unitKind == ARG_KIND_SURFACE)
1829                      {
1830                          if (surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE)
1831                          {
1832                              m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
1833                              m_args[index].nCustomValue = surfSampler8x8->GetAddressControlMode();
1834                          }
1835                          else
1836                          {
1837                              m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
1838                          }
1839                      }
1840                      else if (m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER8X8_AVS &&
1841                          m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER8X8_VA)
1842                      {
1843                          CM_ASSERTMESSAGE("Error: Assign a Sampler8x8 surface to the arg which is previously 2D surface.");
1844                          return CM_FAILURE;
1845                      }
1846                      break;
1847                  }
1848                  case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
1849                  {
1850                      CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (surface);
1851                      surfSampler->GetHandle(samplerIndex);
1852                      surfSampler->GetCmIndexCurrent(samplerCmIndex);
1853 
1854                      m_surfaceMgr->GetSurface(samplerCmIndex, surface);
1855                      if (!surface)
1856                      {
1857                          CM_ASSERTMESSAGE("Error: Invalid sampler surface.");
1858                          return CM_FAILURE;
1859                      }
1860 
1861                      size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1862                      m_args[index].unitSize = (uint16_t)size;
1863 
1864                      value = &samplerIndex;
1865                      surfIndexValue = &samplerCmIndex;
1866 
1867                      if (m_args[index].unitKind == ARG_KIND_SURFACE)
1868                      {   // first time
1869                          SAMPLER_SURFACE_TYPE type;
1870                          surfSampler->GetSurfaceType(type);
1871                          if (type == SAMPLER_SURFACE_TYPE_2D)
1872                          {
1873                              m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER;
1874                          }
1875                          else if (type == SAMPLER_SURFACE_TYPE_2DUP)
1876                          {
1877                              m_args[index].unitKind = ARG_KIND_SURFACE2DUP_SAMPLER;
1878                          }
1879                          else
1880                          {
1881                              m_args[index].unitKind = ARG_KIND_SURFACE_3D;
1882                          }
1883 
1884                      }
1885                      else if ((m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER) &&
1886                          m_args[index].unitKind != ARG_KIND_SURFACE2DUP_SAMPLER &&
1887                          (m_args[index].unitKind != ARG_KIND_SURFACE_3D))
1888                      {
1889                          CM_ASSERTMESSAGE("Error: Assign a Sampler surface to the arg which is previously 2D/3D surface.");
1890                          return CM_FAILURE;
1891                      }
1892                      break;
1893                  }
1894                  default:
1895                  {
1896                      CM_ASSERTMESSAGE("Error: Invalid surface type.");
1897                      return CM_INVALID_ARG_VALUE;
1898                  }
1899             }
1900         }
1901     }
1902     else if (m_args[index].unitKind == ARG_KIND_SAMPLER)
1903     {
1904         unsigned int numSamplers = m_args[index].unitSize / sizeof(int);
1905 
1906         if (numSamplers > 1)
1907         {
1908             size = numSamplers * sizeof(unsigned int);
1909 
1910             for (unsigned int i = 0; i < numSamplers; i++)
1911             {
1912                 SamplerIndex* samplerIndex = (SamplerIndex*)value + i;
1913                 samplerIdx = samplerIndex->get_data();
1914                 sampler_index_array.push_back(samplerIdx);
1915             }
1916         }
1917         else
1918         {
1919             SamplerIndex* samplerIndex = (SamplerIndex*)value;
1920             samplerIdx = ((SamplerIndex*)value)->get_data();
1921             size = sizeof(unsigned int);
1922             m_args[index].unitSize = (uint16_t)size;
1923             value = &samplerIdx;
1924         }
1925     }
1926 
1927 finish:
1928     if ( nArgType == CM_KERNEL_INTERNEL_ARG_PERKERNEL ) // per kernel arg
1929     {
1930         CM_ARG& arg = m_args[ index ];
1931 
1932         // Assume from now on, size is valid, i.e. confirmed with function signature
1933         if( !arg.value )
1934         {
1935             //Increment size kernel arguments will take up in CURBE
1936             uint32_t tempUnitSize = m_args[ index ].unitSize;
1937             if( (m_args[index].unitKind == ARG_KIND_SURFACE_VME ) ||
1938                 (m_args[index].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
1939                 (m_args[index].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ))
1940             {
1941                 tempUnitSize = CM_ARGUMENT_SURFACE_SIZE;
1942             }
1943 
1944             // first setKernelArg or first setKernelArg after each enqueue
1945             arg.value = MOS_NewArray(uint8_t,size);
1946             if( !arg.value )
1947             {
1948                 CM_ASSERTMESSAGE("Error: Out of system memory.");
1949                 return CM_OUT_OF_HOST_MEMORY;
1950             }
1951 
1952             arg.unitCount = 1;
1953 
1954             CmSafeMemCopy((void *)arg.value, value, size);
1955 
1956             if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
1957                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
1958                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
1959                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
1960                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
1961                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
1962                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
1963                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
1964                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
1965                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
1966                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
1967                  ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
1968             {
1969                 arg.surfIndex = MOS_NewArray(uint16_t, (size / sizeof(int32_t)));
1970                 if (!arg.surfIndex)
1971                 {
1972                     CM_ASSERTMESSAGE("Error: Out of system memory.");
1973                     MosSafeDeleteArray(arg.value);
1974                     return CM_OUT_OF_HOST_MEMORY;
1975                 }
1976                 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(int32_t) * sizeof(uint16_t));
1977                 CmSafeMemCopy((void *)arg.surfIndex, surfIndexValue, size / sizeof(int32_t) * sizeof(uint16_t));
1978             }
1979 
1980             if (m_args[index].unitKind == ARG_KIND_SAMPLER)
1981             {
1982                 for (unsigned int samplerIndex = 0; samplerIndex < sampler_index_array.size(); samplerIndex++)
1983                 {
1984                     *( (int *)arg.value + samplerIndex) = sampler_index_array[samplerIndex];
1985                 }
1986             }
1987 
1988             m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1989             arg.isDirty = true;
1990         }
1991         else
1992         {
1993             if( arg.unitCount != 1 )
1994             {
1995                 CM_ASSERTMESSAGE("Error: Invalid arg count.");
1996                 return CM_FAILURE;
1997             }
1998             if( memcmp( (void *)arg.value, value, size ) != 0 )
1999             {
2000                 CmSafeMemCopy((void *)arg.value, value, size);
2001                 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
2002                 arg.isDirty = true;
2003             }
2004             if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2005              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2006              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2007              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2008              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2009              ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2010              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2011              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2012              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2013              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2014              ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2015              ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2016             {
2017                 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(int32_t) * sizeof(uint16_t));
2018                 CmSafeMemCopy((void *)arg.surfIndex, surfIndexValue, size/sizeof(int32_t) * sizeof(uint16_t));
2019             }
2020 
2021             if (m_args[index].unitKind == ARG_KIND_SAMPLER)
2022             {
2023                 for (unsigned int samplerIndex = 0; samplerIndex < sampler_index_array.size(); samplerIndex++)
2024                 {
2025                     *((int *)arg.value + samplerIndex) = sampler_index_array[samplerIndex];
2026                 }
2027             }
2028         }
2029 
2030         m_perKernelArgExists = true;
2031     }
2032     else //per thread arg
2033     {
2034         CM_ARG& arg = m_args[ index ];
2035 
2036         // Assume from now on, size is valid, i.e. confirmed with function signature
2037         if( !arg.value )
2038         {
2039             //Increment size per-thread arguments will take up in payload of media object or media object walker commands
2040             m_sizeInPayload += arg.unitSize;
2041             DW_ALIGNMENT(m_sizeInPayload);
2042 
2043             // first setThreadArg or first setThreadArg after each enqueue
2044             arg.value = MOS_NewArray(uint8_t, (size * m_threadCount));
2045             if( !arg.value )
2046             {
2047                 CM_ASSERTMESSAGE("Error: Out of system memory.");
2048                 return CM_OUT_OF_HOST_MEMORY;
2049 
2050             }
2051             arg.unitCount = m_threadCount;
2052 
2053             uint32_t offset = size * nThreadID;
2054             uint8_t *threadValue = ( uint8_t *)arg.value;
2055             threadValue += offset;
2056 
2057             CmSafeMemCopy(threadValue, value, size);
2058             if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2059                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2060                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2061                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2062                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2063                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2064                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2065                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2066                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2067                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2068                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2069                  ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2070             {
2071                 arg.surfIndex = MOS_NewArray(uint16_t, (size / sizeof(uint32_t) * m_threadCount));
2072                 if( !arg.surfIndex )
2073                 {
2074                     CM_ASSERTMESSAGE("Error: Out of system memory.");
2075                     MosSafeDeleteArray(arg.value);
2076                     return CM_OUT_OF_HOST_MEMORY;
2077                 }
2078                 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(uint32_t) * sizeof(uint16_t) * m_threadCount);
2079                 CmSafeMemCopy((void *)(arg.surfIndex + size/sizeof(uint32_t)  * nThreadID), surfIndexValue, size/sizeof(uint32_t) * sizeof(uint16_t));
2080             }
2081             m_perThreadArgExists = true;
2082         }
2083         else
2084         {
2085             if( arg.unitCount != m_threadCount )
2086             {
2087                 CM_ASSERTMESSAGE("Error: arg count is not matched with thread count.");
2088                 return CM_FAILURE;
2089 
2090             }
2091             uint32_t offset = size * nThreadID;
2092             uint8_t *threadValue = ( uint8_t *)arg.value;
2093             threadValue += offset;
2094 
2095             if( memcmp( threadValue, value, size ) != 0 )
2096             {
2097                 CmSafeMemCopy(threadValue, value, size);
2098                 m_dirty |= CM_KERNEL_DATA_THREAD_ARG_DIRTY;
2099                 arg.isDirty = true;
2100             }
2101             if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2102                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2103                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2104                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2105                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2106                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2107                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2108                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2109                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2110                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2111                  ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2112                  ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2113             {
2114                 CmSafeMemCopy((void *)(arg.surfIndex + size/sizeof(uint32_t)  * nThreadID), surfIndexValue, size/sizeof(uint32_t) * sizeof(uint16_t));
2115             }
2116         }
2117     }
2118 
2119     m_args[index].isSet = true;
2120 
2121     return CM_SUCCESS;
2122 }
2123 
2124 //*-----------------------------------------------------------------------------
2125 //! Set per kernel arguments. The total size of all per kernel arguments and per thread
2126 //! arguments should be less than or equal to 256 Bytes (CM_MAX_ARG_SIZE_IN_BYTE).
2127 //! The lifetime of all per-kernel and per-thread arguments lasts until the next enqueue,
2128 //! i.e. after enqueue, ALL arguments need to be reset.
2129 //! INPUT:
2130 //!     1) Index of argument in CM kernel function (genx_main). The index is
2131 //!        global for per kernel arguments and per thread arguments.
2132 //!     2) Size of the argument.
2133 //!     3) Pointer to argument value.
2134 //! OUTPUT:
2135 //!     CM_SUCCESS or
2136 //!     CM_INVALID_ARG_INDEX if index is invalid;
2137 //!     CM_INVALID_ARG_SIZE if size is invalid;
2138 //!     CM_INVALID_ARG_VALUE if value is NULL.
2139 //*-----------------------------------------------------------------------------
SetKernelArg(uint32_t index,size_t size,const void * value)2140 CM_RT_API int32_t CmKernelRT::SetKernelArg(uint32_t index, size_t size, const void * value )
2141 {
2142     INSERT_API_CALL_LOG(GetHalState());
2143     //It should be mutual exclusive with Indirect Data
2144     if(m_kernelPayloadData)
2145     {
2146         CM_ASSERTMESSAGE("Error: SetKernelArg should be mutual exclusive with indirect data.");
2147         return CM_KERNELPAYLOAD_PERKERNELARG_MUTEX_FAIL;
2148     }
2149 
2150     if( index >= m_argCount )
2151     {
2152         CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
2153         return CM_INVALID_ARG_INDEX;
2154 
2155     }
2156 
2157     if( !value)
2158     {
2159         CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
2160         return CM_INVALID_ARG_VALUE;
2161     }
2162 
2163     if( size == 0)
2164     {
2165         CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
2166         return CM_INVALID_ARG_SIZE;
2167     }
2168 
2169     int32_t nRetVal = 0;
2170     if ( ( nRetVal = SetArgsInternal( CM_KERNEL_INTERNEL_ARG_PERKERNEL, index, size, value ) ) != CM_SUCCESS )
2171     {
2172         return nRetVal;
2173     }
2174 
2175     return CM_SUCCESS;
2176 }
2177 
SetKernelArgPointer(uint32_t index,size_t size,const void * value)2178 CM_RT_API int32_t CmKernelRT::SetKernelArgPointer(uint32_t index, size_t size, const void *value)
2179 {
2180     INSERT_API_CALL_LOG(GetHalState());
2181 
2182     //It should be mutual exclusive with Indirect Data
2183     if (m_kernelPayloadData)
2184     {
2185         CM_ASSERTMESSAGE("Error: SetKernelArg should be mutual exclusive with indirect data.");
2186         return CM_KERNELPAYLOAD_PERKERNELARG_MUTEX_FAIL;
2187     }
2188 
2189     if (index >= m_argCount)
2190     {
2191         CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
2192         return CM_INVALID_ARG_INDEX;
2193     }
2194 
2195     if (!value)
2196     {
2197         CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
2198         return CM_INVALID_ARG_VALUE;
2199     }
2200 
2201     uint64_t *argValue = MOS_NewArray(uint64_t, 1);
2202     if (!argValue)
2203     {
2204         CM_ASSERTMESSAGE("Error: Out of system memory.");
2205         return CM_OUT_OF_HOST_MEMORY;
2206     }
2207     CmSafeMemSet(argValue, 0, sizeof(uint64_t));
2208     CmSafeMemCopy(argValue, value, size);
2209 
2210     // Get the gfx start address of SVM/stateless buffer.
2211     uint64_t gfxAddress = *(argValue);
2212     MosSafeDeleteArray(argValue);
2213 
2214     // Check the gfx start address is valid or not
2215     std::set<CmSurface *> statelessSurfArray = m_surfaceMgr->GetStatelessSurfaceArray();
2216     bool valid = false;
2217     for(auto surface : statelessSurfArray)
2218     {
2219         CmBuffer_RT *buffer = static_cast<CmBuffer_RT *>(surface);
2220         uint64_t startAddress = 0;
2221         buffer->GetGfxAddress(startAddress);
2222         size_t size = buffer->GetSize();
2223 
2224         if (gfxAddress >= startAddress
2225             && gfxAddress < (startAddress + size))
2226         {
2227             SurfaceIndex *surfIndex = nullptr;
2228             buffer->GetIndex(surfIndex);
2229             uint32_t surfIndexData = surfIndex->get_data();
2230             m_surfaceArray[surfIndexData] = true;
2231 
2232             m_args[index].isStatelessBuffer = true;
2233             m_args[index].index = (uint16_t)surfIndexData;
2234 
2235             valid = true;
2236             break;
2237         }
2238     }
2239     if (!valid)
2240     {
2241         CM_ASSERTMESSAGE("Error: the kernel arg pointer is not valid.");
2242         return CM_INVALID_KERNEL_ARG_POINTER;
2243     }
2244 
2245     int32_t nRetVal = SetArgsInternal(CM_KERNEL_INTERNEL_ARG_PERKERNEL,
2246                                       index,
2247                                       size,
2248                                       value);
2249     if (nRetVal != CM_SUCCESS)
2250     {
2251         return nRetVal;
2252     }
2253 
2254     return CM_SUCCESS;
2255 }
2256 
2257 //*-----------------------------------------------------------------------------
2258 //| Purpose:   Set Static Buffer
2259 //| Return :   The result of operation
2260 //*-----------------------------------------------------------------------------
SetStaticBuffer(uint32_t index,const void * value)2261 CM_RT_API int32_t CmKernelRT::SetStaticBuffer(uint32_t index, const void * value )
2262 {
2263     INSERT_API_CALL_LOG(GetHalState());
2264     if(index >= CM_GLOBAL_SURFACE_NUMBER)
2265     {
2266         CM_ASSERTMESSAGE("Error: Surface Index exceeds max global surface number.");
2267         return CM_INVALID_GLOBAL_BUFFER_INDEX;
2268     }
2269 
2270     if(!value)
2271     {
2272         CM_ASSERTMESSAGE("Error: Invalid StaticBuffer arg value.");
2273         return CM_INVALID_BUFFER_HANDLER;
2274     }
2275 
2276     SurfaceIndex* surfIndex = (SurfaceIndex* )value;
2277     uint32_t surfIndexData = surfIndex->get_data();
2278     if (surfIndexData >= m_surfaceMgr->GetSurfacePoolSize())
2279     {
2280         CM_ASSERTMESSAGE("Error: StaticBuffer doesn't allow alias index.");
2281         return CM_INVALID_ARG_INDEX;
2282     }
2283 
2284     CmSurface* surface  = nullptr;
2285     m_surfaceMgr->GetSurface( surfIndexData, surface );
2286     if(surface == nullptr)
2287     {
2288         CM_ASSERTMESSAGE("Error: Invalid surface.");
2289         return CM_INVALID_BUFFER_HANDLER;
2290     }
2291 
2292     CmBuffer_RT* surf1D = nullptr;
2293     if ( surface->Type() == CM_ENUM_CLASS_TYPE_CMBUFFER_RT )
2294     {
2295         uint32_t handle = 0; // for 1D surf
2296 
2297         surf1D = static_cast< CmBuffer_RT* >( surface );
2298         surf1D->GetHandle( handle );
2299 
2300         if (m_globalSurfaces[index] == nullptr)
2301         {
2302             m_globalSurfaces[index] = MOS_New(SurfaceIndex,0);
2303             if( !m_globalSurfaces[index] )
2304             {
2305                 CM_ASSERTMESSAGE("Error: Out of system memory.");
2306                 return CM_OUT_OF_HOST_MEMORY;
2307             }
2308         }
2309         *m_globalSurfaces[index] = handle;
2310         m_globalCmIndex[index] = surfIndexData;
2311         m_dirty |= CM_KERNEL_DATA_GLOBAL_SURFACE_DIRTY;
2312     }
2313     else
2314     {
2315         CM_ASSERTMESSAGE("Error: StaticBuffer only supports CmBuffer type.");
2316          return CM_INVALID_BUFFER_HANDLER;
2317     }
2318     return CM_SUCCESS;
2319 }
2320 
2321 //*-----------------------------------------------------------------------------
2322 //! Set per thread arguments. The total size of all per kernel arguments and per thread
2323 //! arguments should be less than or equal to 256 Bytes
2324 //! The life time of all per kernel arguments and per thread lasts untill the next enqueue
2325 //! i.e. after enqueue, ALL arguments need to be reset.
2326 //! INPUT:
2327 //!     1) Thread index.
2328 //!     2) Index of argument in CM kernel function (genx_main). The index is
2329 //!        global for per kernel arguments and per thread arguments.
2330 //!     3) Size of the argument.
2331 //!     4) Pointer to argument .
2332 //! OUTPUT:
2333 //!     CM_SUCCESS or
2334 //!     CM_INVALID_ARG_INDEX if index is invalid
2335 //!     CM_INVALID_ARG_SIZE if size is invalid
2336 //!     CM_INVALID_ARG_VALUE if value is nullptr
2337 //*-----------------------------------------------------------------------------
SetThreadArg(uint32_t threadId,uint32_t index,size_t size,const void * value)2338 CM_RT_API int32_t CmKernelRT::SetThreadArg(uint32_t threadId, uint32_t index, size_t size, const void * value )
2339 {
2340     INSERT_API_CALL_LOG(GetHalState());
2341 
2342     //It should be mutual exclusive with Indirect Data
2343     if(m_kernelPayloadData)
2344     {
2345         CM_ASSERTMESSAGE("Error: SetThredArg should be mutual exclusive with indirect data.");
2346         return CM_KERNELPAYLOAD_PERTHREADARG_MUTEX_FAIL;
2347     }
2348 
2349     if(m_threadCount > m_halMaxValues->maxUserThreadsPerTask || m_threadCount <=0)
2350     {
2351         CM_ASSERTMESSAGE("Error: Minimum or Maximum number of threads exceeded.");
2352         return CM_FAILURE;
2353     }
2354 
2355     if( index >= m_argCount )
2356     {
2357         CM_ASSERTMESSAGE("Error: Invalid thread arg count.");
2358         return CM_INVALID_ARG_INDEX;
2359 
2360     }
2361 
2362     if( threadId >= m_threadCount )
2363     {
2364         CM_ASSERTMESSAGE("Error: thread id exceeds the threadcount.");
2365         return CM_INVALID_THREAD_INDEX;
2366 
2367     }
2368 
2369     if( !value)
2370     {
2371         CM_ASSERTMESSAGE("Error: Invalid thread arg value.");
2372         return CM_INVALID_ARG_VALUE;
2373     }
2374 
2375     if( size == 0)
2376     {
2377         CM_ASSERTMESSAGE("Error: Invalid thread arg size.");
2378         return CM_INVALID_ARG_SIZE;
2379     }
2380 
2381     int32_t nRetVal = 0;
2382     if ( ( nRetVal = SetArgsInternal( CM_KERNEL_INTERNEL_ARG_PERTHREAD, index, size, value, threadId ) ) != CM_SUCCESS )
2383     {
2384         return nRetVal;
2385     }
2386 
2387     return CM_SUCCESS;
2388 }
2389 
2390 //*-----------------------------------------------------------------------------
2391 //| Purpose:  Calculate the total size of kernel data
2392 //*-----------------------------------------------------------------------------
CalcKernelDataSize(uint32_t movInstNum,uint32_t numArgs,uint32_t argSize,uint32_t & totalKernelDataSize)2393 int32_t CmKernelRT::CalcKernelDataSize(
2394                 uint32_t movInstNum,                 // [in] the number of move instructions
2395                 uint32_t numArgs,                   // [in] number of args , surface array count
2396                 uint32_t argSize,                   // [in] Size of arguments
2397                 uint32_t & totalKernelDataSize)      // [out] total size of kernel data
2398 {
2399     int32_t hr             = CM_SUCCESS;
2400 
2401     uint32_t headSize = ( KERNEL_INFO_SIZE_IN_DWORD + numArgs * PER_ARG_SIZE_IN_DWORD ) * sizeof( uint32_t );
2402     uint32_t totalSize =  headSize + movInstNum * CM_MOVE_INSTRUCTION_SIZE + m_binarySize + argSize;
2403 
2404     totalSize += 4; // one dword for flag. the first bit is curbe on/off
2405     totalSize += 8; //sizeof( uint64_t ) for id
2406 
2407     totalSize += 16; // static buffer indices
2408     totalSize += 12; // GT Pin buffer indices
2409 
2410     ////////////////////////////////////////////////////////////////////////////
2411     // Calculate indirect data size (start)
2412     ////////////////////////////////////////////////////////////////////////////
2413     // Memory layout for indirect data:
2414     // Indirect Data Size -------------------- 2 bytes (must present)
2415     // Below area is present only if above value is not ZERO
2416     // Indirect Data Buffer ------------------ Size indicated above
2417     totalSize += sizeof(uint16_t);  //field for indirect data size
2418     if(m_usKernelPayloadDataSize)
2419     {
2420         totalSize += m_usKernelPayloadDataSize;
2421     }
2422     // Memory layout for indirect surface:
2423     // Indirect Surface Count ----------------- 2 bytes (must present)
2424     // Below are present only if the above value is not ZERO
2425     // Kind of Indirect Surface 0 ------------- 2 Bytes
2426     // Handle of Indirect Surface 0 ----------- 2 Bytes
2427     // Surface Index of Indirect Surface 0 ---- 2 Bytes
2428     // ..........
2429     // Kind of Indirect Surface n-1 ----------- 2 Bytes
2430     // Handle of Indirect Surface n-1---------- 2 Bytes
2431     // Surface Index of Indirect Surface n-1 -- 2 Bytes
2432     totalSize +=  sizeof(uint16_t); //field for indirect surface count
2433     if(m_usKernelPayloadSurfaceCount)
2434     {
2435         totalSize +=  m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO);
2436     }
2437 
2438     totalKernelDataSize = totalSize;
2439 
2440     return hr;
2441 }
2442 
2443 //*-----------------------------------------------------------------------------
2444 //| Purpose:   Create mov instructions
2445 //|            instructions will be copied into DstMem
2446 //*-----------------------------------------------------------------------------
CreateMovInstructions(uint32_t & movInstNum,uint8_t * & codeDst,CM_ARG * tempArgs,uint32_t numArgs)2447 int32_t CmKernelRT::CreateMovInstructions( uint32_t &movInstNum, uint8_t *&codeDst, CM_ARG* tempArgs, uint32_t numArgs)
2448 {
2449     //Create Mov Instruction
2450     CmDynamicArray      movInsts( numArgs );
2451     uint32_t renderGen = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState->platform.eRenderCoreFamily;
2452     CM_RETURN_CODE ret = m_movInstConstructor->SetInstDistanceConfig(movInsts.GetMaxSize(), renderGen);
2453     if (ret != CM_SUCCESS && ret != CM_NOT_IMPLEMENTED)
2454     {
2455         return ret;
2456     }
2457 
2458     movInstNum = 0;
2459 
2460     //Note: if no thread arg and no per kernel arg, no need move instrcutions at all.
2461     if( m_curbeEnabled && (m_perThreadArgExists || m_perKernelArgExists))
2462     {
2463         if( ( m_argCount > 0 ) && ( m_threadCount > 1) )
2464         {
2465             PCM_ARG* sortedArgs = MOS_NewArray(PCM_ARG,numArgs);
2466             if( !sortedArgs )
2467             {
2468                 CM_ASSERTMESSAGE("Error: Out of system memory.");
2469                 return CM_OUT_OF_HOST_MEMORY;
2470             }
2471             for( uint32_t j = 0; j < numArgs; j++ )
2472             {
2473                 sortedArgs[ j ] = tempArgs + j;
2474             }
2475             // sort arg to sortedArgs accorind to offsetinPayload
2476             QuickSort( sortedArgs, 0, numArgs - 1 );
2477 
2478             // record compiler generated offset, used as move dst later
2479             uint16_t *unitOffsetInPayloadSorted = MOS_NewArray(uint16_t, numArgs);
2480             if( !unitOffsetInPayloadSorted )
2481             {
2482                 CM_ASSERTMESSAGE("Error: Out of system memory.");
2483                 MosSafeDeleteArray(sortedArgs);
2484                 return CM_OUT_OF_HOST_MEMORY;
2485             }
2486             for( uint32_t j = 0; j < numArgs; j++ )
2487             {
2488                 unitOffsetInPayloadSorted[j] = sortedArgs[j]->unitOffsetInPayload;
2489             }
2490 
2491             uint16_t kernelArgEnd = 32;
2492             bool beforeFirstThreadArg = true;
2493             for( uint32_t j = 0; j < numArgs; j++ )
2494             {
2495                 if( sortedArgs[j]->unitCount == 1 )
2496                     // consider m_threadCount = 1 case later, where all args are treated as per thread arg
2497                 {
2498                     if( beforeFirstThreadArg )
2499                     {
2500                         kernelArgEnd = sortedArgs[j]->unitOffsetInPayload + sortedArgs[j]->unitSize;
2501                     }
2502                     else
2503                     {
2504                         DW_ALIGNMENT( kernelArgEnd ); // necessary ?
2505                         sortedArgs[j]->unitOffsetInPayload = kernelArgEnd;
2506                         kernelArgEnd += sortedArgs[j]->unitSize;
2507                     }
2508                 }
2509                 else // per thread
2510                 {
2511                     if( beforeFirstThreadArg )
2512                     {
2513                         beforeFirstThreadArg = false;
2514                     }
2515                 }
2516             }
2517 
2518             GRF_ALIGNMENT(kernelArgEnd); // offset of thread arg start related to R0
2519             uint32_t threadArgStart = kernelArgEnd;
2520 
2521             for (uint32_t j = 0; j < numArgs; j++)
2522             {
2523                 if (sortedArgs[j]->unitCount > 1) // per thread
2524                 {
2525                     sortedArgs[j]->unitOffsetInPayload = (uint16_t)threadArgStart;
2526                     threadArgStart += sortedArgs[j]->unitSize;
2527                     DW_ALIGNMENT(threadArgStart);
2528                 }
2529             }
2530 
2531             bool needMovInstructions = false;
2532             for( uint32_t j = 0; j < numArgs; j++ )
2533             {
2534                 if ( unitOffsetInPayloadSorted[j] != sortedArgs[j]->unitOffsetInPayload )
2535                 {
2536                     needMovInstructions = true;
2537                     break;
2538                 }
2539             }
2540 
2541             if (needMovInstructions)
2542             {
2543                 // Add move
2544                 GRF_ALIGNMENT(threadArgStart);
2545                 uint32_t threadArgEnd = threadArgStart;
2546                 uint32_t size = threadArgEnd - 32;
2547                 CM_ASSERT((size % 32) == 0);
2548 
2549                 // move all arguments starting from R1 (32 ) through threadArgEnd to R64 (R0 reserved for media dispatch)
2550                 uint32_t nextIndex = 0;
2551                 nextIndex += m_movInstConstructor->ConstructObjMovs(R64_OFFSET, 32, size, movInsts, nextIndex, true, m_blhwDebugEnable);
2552 
2553                 beforeFirstThreadArg = true;
2554                 for (uint32_t j = 0; j < numArgs; j++)
2555                 {
2556                     if (sortedArgs[j]->unitCount == 1)
2557                         // consider m_threadCount = 1 case later, where all args are treated as per thread arg
2558                     {
2559                         if (beforeFirstThreadArg == false)
2560                         {
2561                             // add move inst to move from sortedArgs[j]->unitOffsetInPayload + R64 to unitOffsetInPayloadSorted[j]
2562                             nextIndex += m_movInstConstructor->ConstructObjMovs(unitOffsetInPayloadSorted[j],
2563                                 R64_OFFSET + sortedArgs[j]->unitOffsetInPayload - 32,
2564                                 sortedArgs[j]->unitSize, movInsts, nextIndex, true, m_blhwDebugEnable);
2565                         }
2566                     }
2567                     else // per thread
2568                     {
2569                         if (beforeFirstThreadArg)
2570                         {
2571                             beforeFirstThreadArg = false;
2572                         }
2573 
2574                         // add move inst to move from sortedArgs[j]->unitOffsetInPayload + R64 to unitOffsetInPayloadSorted[j]
2575                         nextIndex += m_movInstConstructor->ConstructObjMovs(unitOffsetInPayloadSorted[j],
2576                             R64_OFFSET + sortedArgs[j]->unitOffsetInPayload - CM_PAYLOAD_OFFSET,
2577                             sortedArgs[j]->unitSize, movInsts, nextIndex, true, m_blhwDebugEnable);
2578                     }
2579                 }
2580 
2581                 movInstNum = nextIndex;
2582             }
2583 
2584             MosSafeDeleteArray(sortedArgs);
2585             MosSafeDeleteArray(unitOffsetInPayloadSorted);
2586         }
2587     }// End of if( m_curbeEnabled && m_ThreadArgExists)
2588 
2589     uint32_t addInstDW[4];
2590     MOS_ZeroMemory(addInstDW, CM_MOVE_INSTRUCTION_SIZE);
2591     uint32_t addInstNum =0;
2592 
2593     if(m_threadSpace && m_adjustScoreboardY)
2594     {
2595         addInstNum = 1;
2596 
2597         addInstDW[0] = CM_BDW_ADJUST_Y_SCOREBOARD_DW0;
2598         addInstDW[1] = CM_BDW_ADJUST_Y_SCOREBOARD_DW1;
2599         addInstDW[2] = CM_BDW_ADJUST_Y_SCOREBOARD_DW2;
2600 
2601         // constant word needs high 16 bits to be same as low 16 bits
2602         uint16_t tmp = - (int32_t)(m_adjustScoreboardY);
2603         addInstDW[3] = (tmp << 16) + tmp;
2604 
2605     }
2606 
2607     if (movInstNum || addInstNum)
2608     {
2609         codeDst = MOS_NewArray(uint8_t, ((movInstNum + addInstNum)  * CM_MOVE_INSTRUCTION_SIZE));
2610         if (!codeDst)
2611         {
2612             return CM_OUT_OF_HOST_MEMORY;
2613         }
2614     }
2615 
2616     for( uint32_t j = 0; j < movInstNum; j ++ )
2617     {
2618         MovInst_RT* movInst = (MovInst_RT*)movInsts.GetElement( j );
2619         if (!movInst)
2620         {
2621             CM_ASSERTMESSAGE("Error: Invalid move instructions.");
2622             MosSafeDeleteArray(codeDst);
2623             return CM_FAILURE;
2624         }
2625         if (j != 0)
2626         {
2627             movInst->ClearDebug();
2628         }
2629         CmSafeMemCopy(codeDst + j * CM_MOVE_INSTRUCTION_SIZE, movInst->GetBinary(), CM_MOVE_INSTRUCTION_SIZE);
2630         CmSafeDelete(movInst); // delete each element in movInsts
2631     }
2632     movInsts.Delete();
2633 
2634     if(addInstNum != 0)
2635     {
2636        CmSafeMemCopy(codeDst + movInstNum * CM_MOVE_INSTRUCTION_SIZE, addInstDW, CM_MOVE_INSTRUCTION_SIZE);
2637 
2638        movInstNum += addInstNum; // take add Y instruction into consideration
2639     }
2640 
2641     return CM_SUCCESS;
2642 }
2643 
CreateKernelArgDataGroup(uint8_t * & data,uint32_t value)2644 int32_t CmKernelRT::CreateKernelArgDataGroup(
2645     uint8_t   *&data,
2646     uint32_t   value)
2647 {
2648     if (data == nullptr)
2649     {
2650         data = MOS_NewArray(uint8_t, sizeof(uint32_t));
2651         if(!data)
2652         {
2653             return CM_OUT_OF_HOST_MEMORY;
2654         }
2655     }
2656     *(uint32_t *)data = value;
2657     return CM_SUCCESS;
2658 }
2659 
CreateKernelImplicitArgDataGroup(uint8_t * & data,uint32_t size)2660 int32_t CmKernelRT::CreateKernelImplicitArgDataGroup(
2661     uint8_t   *&data,
2662     uint32_t   size)
2663 {
2664     data = MOS_NewArray(uint8_t, (size * sizeof(uint32_t)));
2665     if (!data)
2666     {
2667         return CM_OUT_OF_HOST_MEMORY;
2668     }
2669     *(uint32_t *)data = 0;
2670     return CM_SUCCESS;
2671 }
2672 
2673 //*-----------------------------------------------------------------------------
2674 //| Purpose:   Create mov instructions
2675 //|            instructions will be copied into DstMem
2676 //*-----------------------------------------------------------------------------
CreateThreadArgData(PCM_HAL_KERNEL_ARG_PARAM kernelArg,uint32_t threadArgIndex,CmThreadSpaceRT * threadSpace,CM_ARG * cmArgs)2677 int32_t CmKernelRT::CreateThreadArgData(
2678     PCM_HAL_KERNEL_ARG_PARAM    kernelArg,
2679     uint32_t                    threadArgIndex,
2680     CmThreadSpaceRT*              threadSpace,
2681     CM_ARG*                     cmArgs )
2682 {
2683     int32_t         hr              = CM_SUCCESS;
2684     uint32_t        threadArgCount  = cmArgs[ threadArgIndex].unitCount;
2685     uint32_t        threadArgSize   = cmArgs[ threadArgIndex ].unitSize;
2686 
2687     if (CHECK_SURFACE_TYPE(cmArgs->unitKind,  ARG_KIND_SURFACE_VME))
2688     {
2689         // reallocate the memory since the number of surfaces in a vme surface could vary
2690         MosSafeDeleteArray(kernelArg->firstValue);
2691     }
2692 
2693     if( kernelArg->firstValue  == nullptr)
2694     {
2695         // if firstValue = nullptr, then create a new one, otherwise, update the exisitng one
2696         kernelArg->firstValue = MOS_NewArray(uint8_t, (cmArgs[threadArgIndex].unitCount * cmArgs[threadArgIndex].unitSize));
2697         if( !kernelArg->firstValue )
2698         {
2699             hr = CM_OUT_OF_HOST_MEMORY;
2700             goto finish;
2701         }
2702     }
2703 
2704     if(kernelArg->unitCount == 1 ) // kernel arg
2705     {
2706         if (cmArgs[threadArgIndex].value)
2707         {
2708             CmSafeMemCopy(kernelArg->firstValue, cmArgs[threadArgIndex].value, threadArgCount * threadArgSize);
2709         }
2710         goto finish;
2711     }
2712 
2713     if( threadSpace != nullptr )
2714     {
2715         CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
2716         threadSpace->GetDependencyPatternType(dependencyPatternType);
2717 
2718         if ((m_threadSpaceAssociated == true) &&  (dependencyPatternType != CM_NONE_DEPENDENCY))
2719         {
2720             CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
2721             threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
2722 
2723             uint32_t *boardOrder = nullptr;
2724             threadSpace->GetBoardOrder(boardOrder);
2725 
2726             for (uint32_t index = 0; index < threadArgCount; index++)
2727             {
2728                 uint32_t offset = threadSpaceUnit[boardOrder[index]].threadId;
2729                 uint8_t *argSrc = (uint8_t*)cmArgs[threadArgIndex].value + offset * threadArgSize;
2730                 uint8_t *argDst = kernelArg->firstValue + index * threadArgSize;
2731                 CmSafeMemCopy(argDst, argSrc, threadArgSize);
2732             }
2733         }
2734         else
2735         {
2736            CmSafeMemCopy(kernelArg->firstValue, cmArgs[ threadArgIndex ].value, threadArgCount * threadArgSize);
2737         }
2738     }
2739     else
2740     {
2741         CmSafeMemCopy(kernelArg->firstValue, cmArgs[ threadArgIndex ].value, threadArgCount * threadArgSize);
2742     }
2743 
2744 finish:
2745     return hr;
2746 }
2747 
2748 //*-----------------------------------------------------------------------------
2749 //| Purpose:   Sort thread space for scorboarding
2750 //*-----------------------------------------------------------------------------
SortThreadSpace(CmThreadSpaceRT * threadSpace)2751 int32_t CmKernelRT::SortThreadSpace( CmThreadSpaceRT*  threadSpace )
2752 {
2753     int32_t                   hr = CM_SUCCESS;
2754     CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
2755 
2756     CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace);
2757 
2758     threadSpace->GetDependencyPatternType(dependencyPatternType);
2759 
2760     if(!threadSpace->IsThreadAssociated())
2761     {//Skip Sort if it is media walker
2762         return CM_SUCCESS;
2763     }
2764 
2765     if (threadSpace->CheckDependencyVectorsSet())
2766     {
2767         threadSpace->WavefrontDependencyVectors();
2768     }
2769     else
2770     {
2771         switch (dependencyPatternType)
2772         {
2773             case CM_WAVEFRONT:
2774                 threadSpace->Wavefront45Sequence();
2775                 break;
2776 
2777             case CM_WAVEFRONT26:
2778                 threadSpace->Wavefront26Sequence();
2779                 break;
2780 
2781             case CM_WAVEFRONT26Z:
2782                 threadSpace->Wavefront26ZSequence();
2783                 break;
2784 
2785             case CM_WAVEFRONT26ZI:
2786                 CM_26ZI_DISPATCH_PATTERN dispatchPattern;
2787                 threadSpace->Get26ZIDispatchPattern(dispatchPattern);
2788                 switch (dispatchPattern)
2789                 {
2790                 case VVERTICAL_HVERTICAL_26:
2791                     threadSpace->Wavefront26ZISeqVVHV26();
2792                     break;
2793                 case VVERTICAL_HHORIZONTAL_26:
2794                     threadSpace->Wavefront26ZISeqVVHH26();
2795                     break;
2796                 case VVERTICAL26_HHORIZONTAL26:
2797                     threadSpace->Wavefront26ZISeqVV26HH26();
2798                     break;
2799                 case VVERTICAL1X26_HHORIZONTAL1X26:
2800                     threadSpace->Wavefront26ZISeqVV1x26HH1x26();
2801                     break;
2802                 default:
2803                     threadSpace->Wavefront26ZISeqVVHV26();
2804                     break;
2805                 }
2806                 break;
2807 
2808             case CM_HORIZONTAL_WAVE:
2809                 threadSpace->HorizentalSequence();
2810                 break;
2811 
2812             case CM_VERTICAL_WAVE:
2813                 threadSpace->VerticalSequence();
2814                 break;
2815 
2816             case CM_NONE_DEPENDENCY:
2817             case CM_WAVEFRONT26X:
2818             case CM_WAVEFRONT26ZIG:
2819                 break;
2820 
2821             default:
2822                 CM_ASSERTMESSAGE("Error: Invalid thread dependency type.");
2823                 hr = CM_FAILURE;
2824                 break;
2825         }
2826     }
2827 
2828 finish:
2829     return hr;
2830 }
2831 
2832 //*-----------------------------------------------------------------------------
2833 //| Purpose:   Create temp args array with surface array broken down
2834 //|            instructions will be copied into DstMem
2835 //*-----------------------------------------------------------------------------
CreateTempArgs(uint32_t numArgs,CM_ARG * & tempArgs)2836 int32_t CmKernelRT::CreateTempArgs(
2837     uint32_t     numArgs,
2838     CM_ARG*      &tempArgs)
2839 {
2840     int32_t     hr              = CM_SUCCESS;
2841     int32_t     numSurfaces    = 0;
2842     int32_t     increasedArgs  = 0;
2843 
2844     if( numArgs < m_argCount || tempArgs != nullptr )
2845     {
2846         CM_ASSERTMESSAGE("Error: Invalid arg number or arg value.");
2847         hr = CM_FAILURE;
2848         goto finish;
2849     }
2850 
2851     tempArgs = MOS_NewArray(CM_ARG, numArgs);
2852     CM_CHK_NULL_GOTOFINISH(tempArgs, CM_OUT_OF_HOST_MEMORY);
2853     CmSafeMemSet(tempArgs, 0, numArgs* sizeof(CM_ARG) );
2854 
2855     for( uint32_t j = 0; j < m_argCount; j++ )
2856     {
2857         if ( CHECK_SURFACE_TYPE( m_args[ j ].unitKind, // first time
2858                                 ARG_KIND_SURFACE,
2859                                 ARG_KIND_SURFACE_1D,
2860                                 ARG_KIND_SURFACE_2D,
2861                                 ARG_KIND_SURFACE_2D_UP,
2862                                 ARG_KIND_SURFACE_SAMPLER,
2863                                 ARG_KIND_SURFACE2DUP_SAMPLER,
2864                                 ARG_KIND_SURFACE_3D,
2865                                 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
2866                                 ARG_KIND_SURFACE_SAMPLER8X8_VA,
2867                                 ARG_KIND_SURFACE_2D_SCOREBOARD,
2868                                 ARG_KIND_STATE_BUFFER ) )
2869         {
2870             numSurfaces = m_args[j].unitSize/sizeof(int);
2871 
2872             if (numSurfaces > 1)
2873             {
2874                 if (m_args[j].unitCount == 1)
2875                 { //Kernel arg
2876                     for (int32_t k = 0; k < numSurfaces; k++)
2877                     {
2878                         tempArgs[j + increasedArgs + k] = m_args[j];
2879                         tempArgs[j + increasedArgs + k].unitSize = sizeof(int32_t);
2880                         tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int32_t);
2881                         tempArgs[j + increasedArgs + k].value = (uint8_t *)((uint32_t *)m_args[j].value + k);
2882                         tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2883                         tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2884                         //For each surface kind and custom value  in surface array
2885                         if (!m_args[j].surfIndex[k])
2886                         {
2887                             //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
2888                             //This is for special usage if there is empty element in surface array.
2889                             tempArgs[j + increasedArgs + k].unitKind = CM_ARGUMENT_SURFACE2D;
2890                             continue;
2891                         }
2892                         tempArgs[j + increasedArgs + k].unitKind = m_args[j].surfArrayArg[k].argKindForArray;
2893                         tempArgs[j + increasedArgs + k].nCustomValue = m_args[j].surfArrayArg[k].addressModeForArray;
2894                     }
2895                 }
2896                 else
2897                 {
2898                     uint32_t *surfaces = (uint32_t *)MOS_NewArray(uint8_t, ((sizeof(int32_t) * m_args[j].unitCount)));
2899                     CM_CHK_NULL_GOTOFINISH(surfaces, CM_OUT_OF_HOST_MEMORY);
2900                     for (int32_t k = 0; k < numSurfaces; k++)
2901                     {
2902                         tempArgs[j + increasedArgs + k] = m_args[j];
2903                         tempArgs[j + increasedArgs + k].unitSize = sizeof(int32_t);
2904                         tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int32_t);
2905                         tempArgs[j + increasedArgs + k].value = MOS_NewArray(uint8_t, ((sizeof(int32_t) * m_args[j].unitCount)));
2906                         if(tempArgs[j + increasedArgs + k].value == nullptr)
2907                         {
2908                             CM_ASSERTMESSAGE("Error: Out of system memory.");
2909                             hr = CM_OUT_OF_HOST_MEMORY;
2910                             MosSafeDeleteArray(surfaces);
2911                             goto finish;
2912                         }
2913                         for (uint32_t s = 0; s < m_args[j].unitCount; s++)
2914                         {
2915                             surfaces[s] = *(uint32_t *)((uint32_t *)m_args[j].value + k + numSurfaces * s);
2916                         }
2917                         CmSafeMemCopy(tempArgs[j + increasedArgs + k].value, surfaces, sizeof(int32_t) * m_args[j].unitCount);
2918                         tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2919                         tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = (uint16_t)-1;
2920                     }
2921                     MosSafeDeleteArray(surfaces);
2922                 }
2923                 increasedArgs += numSurfaces - 1;
2924             }
2925             else
2926             {
2927                 tempArgs[j + increasedArgs] = m_args[j];
2928             }
2929         }
2930         else if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
2931         {
2932             numSurfaces = m_args[ j ].unitVmeArraySize;
2933             if(numSurfaces == 1)
2934             {  // single vme surface
2935                tempArgs[j + increasedArgs] = m_args[j];
2936             }
2937             else
2938             {  // multiple vme surfaces in surface array
2939                 if (m_args[j].unitCount == 1) { //Kernel arg
2940                     uint32_t vmeSurfOffset = 0;
2941 
2942                     for (int32_t k = 0; k < numSurfaces; k++)
2943                     {
2944                         uint16_t vmeSize = (uint16_t)getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[j].value + vmeSurfOffset));
2945 
2946                         tempArgs[j + increasedArgs + k] = m_args[j];
2947                         tempArgs[j + increasedArgs + k].unitSize = vmeSize;
2948                         tempArgs[j + increasedArgs + k].unitSizeOrig = vmeSize;
2949                         tempArgs[j + increasedArgs + k].value = (uint8_t *)(m_args[j].value + vmeSurfOffset);
2950                         tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + k*4;
2951                         tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2952 
2953                         vmeSurfOffset += vmeSize;
2954                     }
2955                 }
2956              }
2957             increasedArgs += numSurfaces - 1;
2958         }
2959         else if (m_args[j].unitKind == ARG_KIND_SAMPLER)
2960         {
2961             unsigned int numSamplers = m_args[j].unitSize / sizeof(int);
2962 
2963             if (numSamplers > 1)
2964             {
2965                 if (m_args[j].unitCount == 1)
2966                 {
2967                     //Kernel arg
2968                     for (unsigned int k = 0; k < numSamplers; k++)
2969                     {
2970                         tempArgs[j + increasedArgs + k] = m_args[j];
2971                         tempArgs[j + increasedArgs + k].unitSize = sizeof(int);
2972                         tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int);
2973                         tempArgs[j + increasedArgs + k].value = (unsigned char *)((unsigned int *)m_args[j].value + k);
2974                         tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2975                         tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2976                         tempArgs[j + increasedArgs + k].unitKind = CM_ARGUMENT_SAMPLER;
2977                     }
2978                 }
2979                 else
2980                 {
2981                     // Use sampler index array as thread arg.
2982                     // Not implemented yet.
2983                     return CM_NOT_IMPLEMENTED;
2984                 }
2985                 increasedArgs += numSamplers - 1;
2986             }
2987             else
2988             {
2989                 tempArgs[j + increasedArgs] = m_args[j];
2990             }
2991         }
2992         else
2993         {
2994             tempArgs[j + increasedArgs] = m_args[j];
2995         }
2996     }
2997 
2998 finish:
2999     if(hr == CM_OUT_OF_HOST_MEMORY)
3000     {
3001         if(tempArgs)
3002         {
3003             for (uint32_t j = 0; j < numArgs; j++)
3004             {
3005                 MosSafeDeleteArray(tempArgs[j].value);
3006             }
3007         }
3008         MosSafeDeleteArray( tempArgs );
3009     }
3010     return hr;
3011 }
3012 
3013 //*-----------------------------------------------------------------------------
3014 //| Purpose:   Get the number of args includes the num of surfaces in surface array
3015 //*-----------------------------------------------------------------------------
GetArgCountPlusSurfArray(uint32_t & argSize,uint32_t & argCountPlus)3016 int32_t CmKernelRT::GetArgCountPlusSurfArray(uint32_t &argSize, uint32_t & argCountPlus)
3017 {
3018     argCountPlus = m_argCount;
3019     argSize      = 0;
3020 
3021     if(m_usKernelPayloadDataSize)
3022     { // if payload data exists, the number of args is zero
3023         argCountPlus  = 0;
3024         argSize       = 0;
3025         return CM_SUCCESS;
3026     }
3027 
3028     if( m_argCount != 0 )   //Need pass the arg either by arguments area, or by indirect payload area
3029     {
3030          //Sanity check for argument setting
3031         if((m_perThreadArgExists == false) && (m_perKernelArgExists == false) && (m_usKernelPayloadDataSize == 0))
3032         {
3033             if ( m_stateBufferBounded == CM_STATE_BUFFER_NONE )
3034             {
3035                 CM_ASSERTMESSAGE( "Error: Kernel arguments are not set." );
3036                 return CM_NOT_SET_KERNEL_ARGUMENT;
3037             }
3038         }
3039 
3040         if(m_perThreadArgExists || m_perKernelArgExists)
3041         {
3042             unsigned int extraArgs = 0;
3043 
3044             for( uint32_t j = 0; j < m_argCount; j ++ )
3045             {
3046                 //Sanity checking for every argument setting
3047                 if ( !m_args[j].isSet )
3048                 {
3049                     CM_ASSERTMESSAGE("Error: One Kernel argument is not set.");
3050                     return CM_KERNEL_ARG_SETTING_FAILED;
3051                 }
3052 
3053                 argSize += m_args[j].unitSize * m_args[j].unitCount;
3054 
3055                 if ( CHECK_SURFACE_TYPE( m_args[ j ].unitKind,
3056                                         ARG_KIND_SURFACE,
3057                                         ARG_KIND_SURFACE_1D,
3058                                         ARG_KIND_SURFACE_2D,
3059                                         ARG_KIND_SURFACE_2D_UP,
3060                                         ARG_KIND_SURFACE_SAMPLER,
3061                                         ARG_KIND_SURFACE2DUP_SAMPLER,
3062                                         ARG_KIND_SURFACE_3D,
3063                                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
3064                                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
3065                                         ARG_KIND_SURFACE_2D_SCOREBOARD,
3066                                         ARG_KIND_STATE_BUFFER ) )
3067                 {
3068                      int numSurfaces = m_args[j].unitSize/sizeof(int);
3069                      if (numSurfaces > 1) {
3070                            extraArgs += numSurfaces - 1;
3071                      }
3072                 }
3073                 else if (CHECK_SURFACE_TYPE(m_args[j].unitKind, ARG_KIND_SURFACE_VME))
3074                 {
3075                     int numSurfaces = m_args[j].unitVmeArraySize;
3076                     if (numSurfaces > 1) {
3077                         extraArgs += numSurfaces - 1;
3078                     }
3079                 }
3080                 else if (m_args[j].unitKind == ARG_KIND_SAMPLER)
3081                 {
3082                     int numSamplers = m_args[j].unitSize / sizeof(int);
3083                     if (numSamplers > 1)
3084                     {
3085                         extraArgs += (numSamplers - 1);
3086                     }
3087                 }
3088             }
3089 
3090             argCountPlus = m_argCount + extraArgs;
3091         }
3092     }
3093     return CM_SUCCESS;
3094 }
3095 
3096 //*-----------------------------------------------------------------------------
3097 //| Purpose:   Create Thread Space Param
3098 //*-----------------------------------------------------------------------------
CreateThreadSpaceParam(PCM_HAL_KERNEL_THREADSPACE_PARAM kernelThreadSpaceParam,CmThreadSpaceRT * threadSpace)3099 int32_t CmKernelRT::CreateThreadSpaceParam(
3100     PCM_HAL_KERNEL_THREADSPACE_PARAM kernelThreadSpaceParam,
3101     CmThreadSpaceRT*                   threadSpace     )
3102 {
3103     int32_t                      hr = CM_SUCCESS;
3104     CM_HAL_DEPENDENCY*           dependency = nullptr;
3105     uint32_t                     threadSpaceWidth = 0;
3106     uint32_t                     threadSpaceHeight =0;
3107     CM_THREAD_SPACE_UNIT         *threadSpaceUnit = nullptr;
3108     CM_THREAD_SPACE_DIRTY_STATUS dirtyStatus = CM_THREAD_SPACE_CLEAN;
3109 
3110     if (kernelThreadSpaceParam == nullptr || threadSpace == nullptr)
3111     {
3112         CM_ASSERTMESSAGE("Error: Pointer to CmKernelThreadSpaceParam or thread space is null.");
3113         hr = CM_NULL_POINTER;
3114         goto finish;
3115     }
3116 
3117     threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
3118     kernelThreadSpaceParam->threadSpaceWidth =  (uint16_t)threadSpaceWidth;
3119     kernelThreadSpaceParam->threadSpaceHeight = (uint16_t)threadSpaceHeight;
3120 
3121     threadSpace->GetDependencyPatternType(kernelThreadSpaceParam->patternType);
3122     threadSpace->GetWalkingPattern(kernelThreadSpaceParam->walkingPattern);
3123     threadSpace->GetDependency( dependency);
3124 
3125     if(dependency != nullptr)
3126     {
3127         CmSafeMemCopy(&kernelThreadSpaceParam->dependencyInfo, dependency, sizeof(CM_HAL_DEPENDENCY));
3128     }
3129 
3130     if( threadSpace->CheckWalkingParametersSet( ) )
3131     {
3132         kernelThreadSpaceParam->walkingParamsValid = 1;
3133         CM_CHK_CMSTATUS_GOTOFINISH(threadSpace->GetWalkingParameters(kernelThreadSpaceParam->walkingParams));
3134     }
3135     else
3136     {
3137         kernelThreadSpaceParam->walkingParamsValid = 0;
3138     }
3139 
3140     if( threadSpace->CheckDependencyVectorsSet( ) )
3141     {
3142         kernelThreadSpaceParam->dependencyVectorsValid = 1;
3143         CM_CHK_CMSTATUS_GOTOFINISH(threadSpace->GetDependencyVectors(kernelThreadSpaceParam->dependencyVectors));
3144     }
3145     else
3146     {
3147         kernelThreadSpaceParam->dependencyVectorsValid = 0;
3148     }
3149 
3150     threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
3151 
3152     if(threadSpaceUnit)
3153     {
3154         kernelThreadSpaceParam->threadCoordinates = MOS_NewArray(CM_HAL_SCOREBOARD, (threadSpaceWidth * threadSpaceHeight));
3155         CM_CHK_NULL_GOTOFINISH(kernelThreadSpaceParam->threadCoordinates , CM_OUT_OF_HOST_MEMORY);
3156         CmSafeMemSet(kernelThreadSpaceParam->threadCoordinates, 0, threadSpaceHeight * threadSpaceWidth * sizeof(CM_HAL_SCOREBOARD));
3157 
3158         uint32_t *boardOrder = nullptr;
3159         threadSpace->GetBoardOrder(boardOrder);
3160         CM_CHK_NULL_GOTOFINISH_CMERROR(boardOrder);
3161 
3162         kernelThreadSpaceParam->reuseBBUpdateMask  = 0;
3163         for(uint32_t i=0; i< threadSpaceWidth * threadSpaceHeight ; i++)
3164         {
3165             kernelThreadSpaceParam->threadCoordinates[i].x = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.x;
3166             kernelThreadSpaceParam->threadCoordinates[i].y = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.y;
3167             kernelThreadSpaceParam->threadCoordinates[i].mask = threadSpaceUnit[boardOrder[i]].dependencyMask;
3168             kernelThreadSpaceParam->threadCoordinates[i].resetMask= threadSpaceUnit[boardOrder[i]].reset;
3169             kernelThreadSpaceParam->threadCoordinates[i].color = threadSpaceUnit[boardOrder[i]].scoreboardColor;
3170             kernelThreadSpaceParam->threadCoordinates[i].sliceSelect = threadSpaceUnit[boardOrder[i]].sliceDestinationSelect;
3171             kernelThreadSpaceParam->threadCoordinates[i].subSliceSelect = threadSpaceUnit[boardOrder[i]].subSliceDestinationSelect;
3172             kernelThreadSpaceParam->reuseBBUpdateMask |= threadSpaceUnit[boardOrder[i]].reset;
3173         }
3174 
3175         if( kernelThreadSpaceParam->patternType == CM_WAVEFRONT26Z )
3176         {
3177             CM_HAL_WAVEFRONT26Z_DISPATCH_INFO dispatchInfo;
3178             threadSpace->GetWavefront26ZDispatchInfo(dispatchInfo);
3179 
3180             kernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
3181             kernelThreadSpaceParam->dispatchInfo.numThreadsInWave = MOS_NewArray(uint32_t, dispatchInfo.numWaves);
3182             CM_CHK_NULL_GOTOFINISH(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave, CM_OUT_OF_HOST_MEMORY);
3183             CmSafeMemCopy(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave,
3184                 dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
3185 
3186          }
3187     }
3188 
3189     //Get group select setting information
3190     threadSpace->GetMediaWalkerGroupSelect(kernelThreadSpaceParam->groupSelect);
3191 
3192     //Get color count
3193     threadSpace->GetColorCountMinusOne(kernelThreadSpaceParam->colorCountMinusOne);
3194 
3195     dirtyStatus = threadSpace->GetDirtyStatus();
3196     switch (dirtyStatus)
3197     {
3198     case CM_THREAD_SPACE_CLEAN:
3199         kernelThreadSpaceParam->bbDirtyStatus = CM_HAL_BB_CLEAN;
3200         break;
3201     default:
3202         kernelThreadSpaceParam->bbDirtyStatus = CM_HAL_BB_DIRTY;
3203         break;
3204     }
3205 
3206 finish:
3207     if( hr == CM_OUT_OF_HOST_MEMORY)
3208     {
3209         if( kernelThreadSpaceParam )
3210         {
3211             MosSafeDeleteArray(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave);
3212             MosSafeDeleteArray(kernelThreadSpaceParam->threadCoordinates);
3213         }
3214     }
3215 
3216     return hr;
3217 }
3218 
3219 //*-----------------------------------------------------------------------------
3220 //| Purpose:   Delete the args array
3221 //*-----------------------------------------------------------------------------
DestroyArgs(void)3222 int32_t CmKernelRT::DestroyArgs( void )
3223 {
3224     for( uint32_t i =0 ; i < m_argCount; i ++ )
3225     {
3226         CM_ARG& arg = m_args[ i ];
3227         MosSafeDeleteArray( arg.value );
3228         MosSafeDeleteArray(arg.surfIndex);
3229         MosSafeDeleteArray(arg.surfArrayArg);
3230         arg.unitCount = 0;
3231         arg.unitSize = 0;
3232         arg.unitKind = 0;
3233         arg.unitOffsetInPayload = 0;
3234         arg.isDirty = true;
3235         arg.isSet = false;
3236     }
3237 
3238     MosSafeDeleteArray( m_args );
3239 
3240     m_threadSpaceAssociated        = false;
3241     m_threadSpace          = nullptr;
3242 
3243     m_perThreadArgExists  = false;
3244     m_perKernelArgExists  = false;
3245 
3246     m_sizeInCurbe = 0;
3247     m_curbeEnabled = true;
3248 
3249     m_sizeInPayload = 0;
3250     m_adjustScoreboardY = 0;
3251 
3252     ResetKernelSurfaces();
3253 
3254     return CM_SUCCESS;
3255 }
3256 
3257 //*-----------------------------------------------------------------------------
// Calling Reset makes it possible to change the per-kernel or per-thread
// property of the arguments because Reset releases the memory for arguments
3260 //*-----------------------------------------------------------------------------
Reset(void)3261 int32_t CmKernelRT::Reset( void )
3262 {
3263     for( uint32_t i =0 ; i < m_argCount; i ++ )
3264     {
3265         CM_ARG& arg = m_args[ i ];
3266         MosSafeDeleteArray( arg.value );
3267         MosSafeDeleteArray( arg.surfIndex);
3268         MosSafeDeleteArray(arg.surfArrayArg);
3269         arg.value = nullptr;
3270         arg.unitCount = 0;
3271 
3272         arg.unitSize = arg.unitSizeOrig;
3273         arg.unitKind = arg.unitKindOrig;
3274         arg.unitOffsetInPayload = arg.unitOffsetInPayloadOrig;
3275 
3276         arg.isDirty = true;
3277         arg.isSet = false;
3278         arg.unitVmeArraySize = 0;
3279 
3280         arg.isStatelessBuffer = false;
3281         arg.index = 0;
3282     }
3283 
3284     m_threadCount = 0;
3285 
3286     m_indexInTask = 0;
3287 
3288     m_perThreadArgExists = false;
3289     m_perKernelArgExists = false;
3290 
3291     m_sizeInCurbe = 0;
3292     m_curbeEnabled = true;
3293 
3294     m_sizeInPayload = 0;
3295 
3296     m_threadSpaceAssociated = false;
3297     m_threadSpace = nullptr;
3298     m_adjustScoreboardY = 0;
3299 
3300     m_threadGroupSpace = nullptr;
3301 
3302     MosSafeDeleteArray(m_kernelPayloadData);
3303     m_usKernelPayloadDataSize = 0;
3304 
3305     if (m_usKernelPayloadSurfaceCount)
3306     {
3307         CmSafeMemSet(m_pKernelPayloadSurfaceArray, 0, m_usKernelPayloadSurfaceCount * sizeof(SurfaceIndex *));
3308         CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
3309         m_usKernelPayloadSurfaceCount = 0;
3310     }
3311 
3312     ResetKernelSurfaces();
3313 
3314     return CM_SUCCESS;
3315 }
3316 
3317 //*-----------------------------------------------------------------------------
3318 //| Purpose:   Get the pointer to arguments array
3319 //*-----------------------------------------------------------------------------
GetArgs(CM_ARG * & arg)3320 int32_t CmKernelRT::GetArgs( CM_ARG* & arg )
3321 {
3322     arg = m_args;
3323     return CM_SUCCESS;
3324 }
3325 
3326 //*-----------------------------------------------------------------------------
3327 //| Purpose:   Get the arguments' count
3328 //*-----------------------------------------------------------------------------
GetArgCount(uint32_t & argCount)3329 int32_t CmKernelRT::GetArgCount( uint32_t & argCount )
3330 {
3331     argCount = m_argCount;
3332     return CM_SUCCESS;
3333 }
3334 
3335 //*-----------------------------------------------------------------------------
3336 //| Purpose:    Get the value of member CurbeEnable
3337 //*-----------------------------------------------------------------------------
GetCurbeEnable(bool & b)3338 int32_t CmKernelRT::GetCurbeEnable( bool& b )
3339 {
3340     b = m_curbeEnabled;
3341     return CM_SUCCESS;
3342 }
3343 
3344 //*-----------------------------------------------------------------------------
3345 //| Purpose:    Set the CurbeEnable member
3346 //*-----------------------------------------------------------------------------
SetCurbeEnable(bool b)3347 int32_t CmKernelRT::SetCurbeEnable( bool b )
3348 {
3349     m_curbeEnabled = b;
3350     return CM_SUCCESS;
3351 }
3352 
3353 //*-----------------------------------------------------------------------------
3354 //| Purpose:   Get the kernel's size in Curbe
3355 //*-----------------------------------------------------------------------------
GetSizeInCurbe(uint32_t & size)3356 int32_t CmKernelRT::GetSizeInCurbe( uint32_t& size )
3357 {
3358     size = m_sizeInCurbe;
3359     return CM_SUCCESS;
3360 }
3361 
3362 //*-----------------------------------------------------------------------------
//| Purpose:   Get the total size in payload of media object or media walker
3364 //*-----------------------------------------------------------------------------
GetSizeInPayload(uint32_t & size)3365 int32_t CmKernelRT::GetSizeInPayload( uint32_t& size )
3366 {
3367     size = m_sizeInPayload;
3368     return CM_SUCCESS;
3369 }
3370 
3371 //*-----------------------------------------------------------------------------
3372 //| Purpose:    Get the pointer to CM device
3373 //*-----------------------------------------------------------------------------
GetCmDevice(CmDeviceRT * & device)3374 int32_t CmKernelRT::GetCmDevice(CmDeviceRT* &device)
3375 {
3376     device = m_device;
3377     return CM_SUCCESS;
3378 }
3379 
GetCmProgram(CmProgramRT * & program)3380 int32_t CmKernelRT::GetCmProgram( CmProgramRT* & program )
3381 {
3382     program = m_program;
3383     return CM_SUCCESS;
3384 }
3385 
CollectKernelSurface()3386 int32_t CmKernelRT::CollectKernelSurface()
3387 {
3388     m_vmeSurfaceCount = 0;
3389     m_maxSurfaceIndexAllocated = 0;
3390 
3391     for( uint32_t j = 0; j < m_argCount; j ++ )
3392     {
3393         if ((m_args[ j ].unitKind == ARG_KIND_SURFACE ) || // first time
3394              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_1D ) ||
3395              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D ) ||
3396              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
3397              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
3398              ( m_args[ j ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
3399              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_3D ) ||
3400              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
3401              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
3402              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_VME ) ||
3403              ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
3404              ( m_args[ j ].unitKind == ARG_KIND_STATE_BUFFER ) )
3405         {
3406             int numSurfaces;
3407             int numValidSurfaces = 0;
3408 
3409             if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
3410             {
3411                 numSurfaces = getSurfNumFromArgArraySize(m_args[j].unitSize, m_args[j].unitVmeArraySize);
3412             }
3413             else
3414             {
3415                 numSurfaces = m_args[j].unitSize/sizeof(int);
3416             }
3417 
3418             for (uint32_t k = 0; k < numSurfaces * m_args[j].unitCount; k ++)
3419             {
3420                 uint16_t surfIndex = 0;
3421                 if (m_args[j].surfIndex)
3422                 {
3423                     surfIndex = m_args[j].surfIndex[k];
3424                 }
3425                 if (surfIndex != 0 && surfIndex != CM_NULL_SURFACE)
3426                 {
3427                     m_surfaceArray[surfIndex] = true;
3428                     numValidSurfaces ++;
3429                     m_maxSurfaceIndexAllocated = Max(m_maxSurfaceIndexAllocated, surfIndex);
3430                 }
3431             }
3432             if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
3433             {
3434                 m_vmeSurfaceCount += numValidSurfaces;
3435             }
3436         }
3437 
3438         if (m_args[ j ].isStatelessBuffer)
3439         {
3440             uint32_t surfIndex = m_args[j].index;
3441             m_surfaceArray[surfIndex] = true;
3442         }
3443     }
3444 
3445     for( int32_t i=0; i < CM_GLOBAL_SURFACE_NUMBER; ++i )
3446     {
3447         if( m_globalSurfaces[i] != nullptr )
3448         {
3449             uint32_t surfIndex = m_globalCmIndex[i];
3450             m_surfaceArray[surfIndex] = true;
3451         }
3452     }
3453 
3454     for (int32_t i = 0; i < m_usKernelPayloadSurfaceCount; i++)
3455     {
3456         if (m_pKernelPayloadSurfaceArray[i] != nullptr)
3457         {
3458             uint32_t surfIndex = m_pKernelPayloadSurfaceArray[i]->get_data();
3459             m_surfaceArray[surfIndex] = true;
3460         }
3461     }
3462 
3463     return CM_SUCCESS;
3464 }
3465 
IsKernelDataReusable(CmThreadSpaceRT * threadSpace)3466 int32_t CmKernelRT::IsKernelDataReusable( CmThreadSpaceRT* threadSpace)
3467 {
3468     if(threadSpace)
3469     {
3470         if(threadSpace->IsThreadAssociated() && (threadSpace->GetDirtyStatus()!= CM_THREAD_SPACE_CLEAN))
3471         {
3472             return false;
3473         }
3474     }
3475 
3476     if(m_threadSpace)
3477     {
3478         if(m_threadSpace->GetDirtyStatus()!= CM_THREAD_SPACE_CLEAN)
3479         {
3480             return  false;
3481         }
3482     }
3483 
3484     if(m_dirty !=  CM_KERNEL_DATA_CLEAN)
3485     {
3486         return false;
3487     }
3488 
3489     return true;
3490 }
3491 
3492 //*-----------------------------------------------------------------------------
3493 //| Purpose:    Prepare Kernel Data including thread args, kernel args
3494 //| Returns:    Result of the operation.
3495 //*-----------------------------------------------------------------------------
CreateKernelData(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadSpaceRT * threadSpace)3496 int32_t CmKernelRT::CreateKernelData(
3497     CmKernelData* & kernelData,  // out
3498     uint32_t& kernelDataSize,         // out
3499     const CmThreadSpaceRT* threadSpace )    // in
3500 {
3501     int32_t              hr              = CM_SUCCESS;
3502     PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
3503 
3504     if( (threadSpace != nullptr) && (m_threadSpace != nullptr) )
3505     {
3506         // per-kernel threadspace and per-task threadspace cannot be set at the same time
3507         return CM_INVALID_THREAD_SPACE;
3508     }
3509 
3510     if(m_lastKernelData == nullptr)
3511     {
3512         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3513         CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3514         CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3515     }
3516     else
3517     {
3518         if(IsKernelDataReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
3519         {
3520             // nothing changed; Reuse m_lastKernelData
3521             kernelData = m_lastKernelData;
3522             CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3523             CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3524             kernelDataSize = kernelData->GetKernelDataSize();
3525 
3526             if (m_threadSpace)
3527             {
3528                 halKernelParam = kernelData->GetHalCmKernelData();
3529                 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
3530                 // need to set to clean here because CmThreadSpaceParam.BBdirtyStatus is only set in CreateKernelDataInternal
3531                 // flag used to re-use batch buffer, don't care if BB is busy if it is "clean"
3532                 halKernelParam->kernelThreadSpaceParam.bbDirtyStatus = CM_HAL_BB_CLEAN;
3533             }
3534         }
3535         else
3536         {
3537             if(m_lastKernelData->IsInUse())
3538             { // Need to Create a new one , if the kernel data is in use
3539                 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3540                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3541                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3542             }
3543             else if(threadSpace && threadSpace->IsThreadAssociated() && (threadSpace->GetDirtyStatus() != CM_THREAD_SPACE_CLEAN))
3544             { // if thread space is assocaited , don't support reuse
3545                 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3546                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3547                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3548             }
3549             else if(m_dirty < CM_KERNEL_DATA_THREAD_COUNT_DIRTY || // Kernel arg or thread arg dirty
3550                 (m_threadSpace && m_threadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DEPENDENCY_MASK_DIRTY))
3551             {
3552                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelData(m_lastKernelData,threadSpace));
3553                 kernelData = m_lastKernelData;
3554                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3555                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3556                 kernelDataSize = kernelData->GetKernelDataSize();
3557 
3558             }
3559             else
3560             {
3561                CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3562                CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3563                CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3564             }
3565         }
3566     }
3567 
3568     CleanArgDirtyFlag();
3569     if(threadSpace)
3570     {
3571         threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3572     }
3573     if (m_threadSpace)
3574     {
3575         m_threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3576     }
3577 
3578 finish:
3579     return hr;
3580 }
3581 
GetName()3582 char* CmKernelRT::GetName() { return (char*)m_kernelInfo->kernelName; }
3583 
3584 //*-----------------------------------------------------------------------------
3585 //| Purpose:    Create Kernel Data
3586 //| Returns:    Result of the operation.
3587 //*-----------------------------------------------------------------------------
CreateKernelData(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadGroupSpace * threadGroupSpace)3588 int32_t CmKernelRT::CreateKernelData(
3589     CmKernelData* & kernelData,  // out
3590     uint32_t& kernelDataSize,         // out
3591     const CmThreadGroupSpace* threadGroupSpace )    // in
3592 {
3593     int32_t     hr   = CM_SUCCESS;
3594     CmThreadGroupSpace* usedThreadGroupSpace = nullptr;
3595 
3596     //If kernel has associated TGS, we will use it, instead of per-task TGS
3597     if (m_threadGroupSpace)
3598     {
3599         usedThreadGroupSpace = m_threadGroupSpace;
3600     }
3601     else
3602     {
3603         usedThreadGroupSpace = const_cast<CmThreadGroupSpace*>(threadGroupSpace);
3604     }
3605 
3606     if(m_lastKernelData == nullptr)
3607     {
3608         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, usedThreadGroupSpace));
3609         CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3610         CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3611     }
3612     else
3613     {
3614         if (!((m_dirty & CM_KERNEL_DATA_KERNEL_ARG_DIRTY) || (m_dirty & CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY)))
3615         {
3616             // nothing changed; Reuse m_lastKernelData
3617             kernelData = m_lastKernelData;
3618             CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3619             CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3620             kernelDataSize = kernelData->GetKernelDataSize();
3621         }
3622         else
3623         {
3624             if(m_lastKernelData->IsInUse())
3625             { // Need to Clone a new one
3626                 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, usedThreadGroupSpace));
3627                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3628                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3629             }
3630             else
3631             {
3632                 // change happend -> Reuse m_lastKernelData but need to change its content accordingly
3633                 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelData(m_lastKernelData, usedThreadGroupSpace));
3634                 kernelData = m_lastKernelData;
3635                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3636                 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3637                 kernelDataSize = kernelData->GetKernelDataSize();
3638             }
3639         }
3640     }
3641 
3642     CleanArgDirtyFlag();
3643 
3644 finish:
3645     return hr;
3646 }
3647 
CleanArgDirtyFlag()3648 int32_t CmKernelRT::CleanArgDirtyFlag()
3649 {
3650 
3651     for(uint32_t i =0 ; i< m_argCount; i++)
3652     {
3653         m_args[i].isDirty = false;
3654     }
3655 
3656     if(m_threadSpace && m_threadSpace->GetDirtyStatus())
3657     {
3658         m_threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3659     }
3660 
3661     m_dirty                 = CM_KERNEL_DATA_CLEAN;
3662 
3663     return CM_SUCCESS;
3664 }
3665 
3666 //*-----------------------------------------------------------------------------
3667 //| Purpose:    Update the global surface and gtpin surface info to kernel data
3668 //| Returns:    Result of the operation.
3669 //*-----------------------------------------------------------------------------
UpdateKernelDataGlobalSurfaceInfo(PCM_HAL_KERNEL_PARAM halKernelParam)3670 int32_t CmKernelRT::UpdateKernelDataGlobalSurfaceInfo( PCM_HAL_KERNEL_PARAM halKernelParam )
3671 {
3672     int32_t hr = CM_SUCCESS;
3673 
3674     //global surface
3675     for ( uint32_t j = 0; j < CM_GLOBAL_SURFACE_NUMBER; j++ )
3676     {
3677         if ( m_globalSurfaces[ j ] != nullptr )
3678         {
3679             halKernelParam->globalSurface[ j ] = m_globalSurfaces[ j ]->get_data();
3680             halKernelParam->globalSurfaceUsed = true;
3681         }
3682         else
3683         {
3684             halKernelParam->globalSurface[ j ] = CM_NULL_SURFACE;
3685         }
3686     }
3687 
3688     for ( uint32_t j = CM_GLOBAL_SURFACE_NUMBER; j < CM_MAX_GLOBAL_SURFACE_NUMBER; j++ )
3689     {
3690         halKernelParam->globalSurface[ j ] = CM_NULL_SURFACE;
3691     }
3692 #if USE_EXTENSION_CODE
3693     UpdateKernelDataGTPinSurfaceInfo(halKernelParam);
3694 #endif
3695 
3696     return hr;
3697 }
3698 
3699 //*-----------------------------------------------------------------------------
3700 //| Purpose:    Prepare Kernel Data including thread args, kernel args
3701 //| Returns:    Result of the operation.
3702 //*-----------------------------------------------------------------------------
CreateKernelDataInternal(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadGroupSpace * threadGroupSpace)3703 int32_t CmKernelRT::CreateKernelDataInternal(
3704     CmKernelData* & kernelData,  // out
3705     uint32_t& kernelDataSize,         // out
3706     const CmThreadGroupSpace* threadGroupSpace)    // in
3707 {
3708     PCM_HAL_KERNEL_PARAM  halKernelParam = nullptr;
3709     int32_t               hr = CM_SUCCESS;
3710     uint32_t              movInstNum = 0;
3711     uint32_t              kernelCurbeSize = 0;
3712     uint32_t              numArgs = 0;
3713     CM_ARG                *tempArgs = nullptr;
3714     uint32_t              argSize = 0;
3715     uint32_t              surfNum = 0; //Pass needed BT entry numbers to HAL CM
3716     CmKernelRT            *cmKernel = nullptr;
3717     uint32_t              minKernelPlayloadOffset = 0;
3718     bool                  adjustLocalIdPayloadOffset = false;
3719 
3720     CM_CHK_CMSTATUS_GOTOFINISH(CmKernelData::Create(this, kernelData));
3721     halKernelParam = kernelData->GetHalCmKernelData();
3722     CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
3723 
3724     //Get Num of args with surface array
3725     CM_CHK_CMSTATUS_GOTOFINISH(GetArgCountPlusSurfArray(argSize, numArgs));
3726 
3727     //Create Temp args
3728     CM_CHK_CMSTATUS_GOTOFINISH(CreateTempArgs(numArgs, tempArgs));
3729 
3730     //Create move instructions
3731     CM_CHK_CMSTATUS_GOTOFINISH(CreateMovInstructions(movInstNum, halKernelParam->movInsData, tempArgs, numArgs));
3732     CM_CHK_CMSTATUS_GOTOFINISH(CalcKernelDataSize(movInstNum, numArgs, argSize, kernelDataSize));
3733     CM_CHK_CMSTATUS_GOTOFINISH(kernelData->SetKernelDataSize(kernelDataSize));
3734 
3735     halKernelParam->clonedKernelParam.isClonedKernel = m_isClonedKernel;
3736     halKernelParam->clonedKernelParam.kernelID       = m_cloneKernelID;
3737     halKernelParam->clonedKernelParam.hasClones      = m_hasClones;
3738 
3739     halKernelParam->kernelId = m_id++;
3740     if ((m_program->m_cisaMajorVersion >= 3 && m_program->m_cisaMinorVersion >= 3))
3741         halKernelParam->numArgs = numArgs;
3742     else
3743         halKernelParam->numArgs = numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM;
3744     halKernelParam->numThreads = m_threadCount;
3745     halKernelParam->kernelBinarySize = m_binarySize + movInstNum * CM_MOVE_INSTRUCTION_SIZE;
3746     halKernelParam->kernelDataSize = kernelDataSize;
3747     halKernelParam->movInsDataSize = movInstNum * CM_MOVE_INSTRUCTION_SIZE;
3748     halKernelParam->kernelDebugEnabled = m_blhwDebugEnable;
3749 
3750     halKernelParam->cmFlags = m_curbeEnabled ? CM_FLAG_CURBE_ENABLED : 0;
3751     halKernelParam->cmFlags |= m_nonstallingScoreboardEnabled ? CM_FLAG_NONSTALLING_SCOREBOARD_ENABLED : 0;
3752 
3753     halKernelParam->kernelBinary = (uint8_t*)m_binary;
3754 
3755     CM_CHK_CMSTATUS_GOTOFINISH(kernelData->GetCmKernel(cmKernel));
3756     if (cmKernel == nullptr)
3757     {
3758         return CM_NULL_POINTER;
3759     }
3760     MOS_SecureStrcpy(halKernelParam->kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE, cmKernel->GetName());
3761 
3762     uint32_t thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth;
3763     threadGroupSpace->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);
3764 
3765     for (uint32_t i = 0; i < numArgs; i++)
3766     {
3767         // get the min kernel payload offset
3768         if ((halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE) && IsKernelArg(tempArgs[i]))
3769         {
3770             if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3771             {
3772                 if (minKernelPlayloadOffset == 0 || minKernelPlayloadOffset > tempArgs[i].unitOffsetInPayload)
3773                 {
3774                     minKernelPlayloadOffset = tempArgs[i].unitOffsetInPayload;
3775                 }
3776             }
3777             else
3778             {
3779                 if ((minKernelPlayloadOffset == 0 || minKernelPlayloadOffset > tempArgs[i].unitOffsetInPayload) && (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID))
3780                 {
3781                     minKernelPlayloadOffset = tempArgs[i].unitOffsetInPayload;
3782                 }
3783             }
3784         }
3785     }
3786 
3787     for (uint32_t i = 0; i < numArgs; i++)
3788     {
3789         halKernelParam->argParams[i].unitCount = tempArgs[i].unitCount;
3790         halKernelParam->argParams[i].kind = (CM_HAL_KERNEL_ARG_KIND)(tempArgs[i].unitKind);
3791         halKernelParam->argParams[i].unitSize = tempArgs[i].unitSize;
3792         halKernelParam->argParams[i].payloadOffset = tempArgs[i].unitOffsetInPayload;
3793         halKernelParam->argParams[i].perThread = false;
3794         halKernelParam->argParams[i].nCustomValue = tempArgs[i].nCustomValue;
3795         halKernelParam->argParams[i].aliasIndex = tempArgs[i].aliasIndex;
3796         halKernelParam->argParams[i].aliasCreated = tempArgs[i].aliasCreated;
3797         halKernelParam->argParams[i].isNull = tempArgs[i].isNull;
3798 
3799         if (tempArgs[i].unitKind == CM_ARGUMENT_IMPLICT_LOCALSIZE) {
3800             CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3801             *(uint32_t *)halKernelParam->argParams[i].firstValue = thrdSpaceWidth;
3802             *(uint32_t *)(halKernelParam->argParams[i].firstValue + 4) = thrdSpaceHeight;
3803             *(uint32_t *)(halKernelParam->argParams[i].firstValue + 8) = thrdSpaceDepth;
3804         }
3805         else if (tempArgs[i].unitKind == CM_ARGUMENT_IMPLICT_GROUPSIZE) {
3806             CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3807             *(uint32_t *)halKernelParam->argParams[i].firstValue = grpSpaceWidth;
3808             *(uint32_t *)(halKernelParam->argParams[i].firstValue + 4) = grpSpaceHeight;
3809             *(uint32_t *)(halKernelParam->argParams[i].firstValue + 8) = grpSpaceDepth;
3810         }
3811         else if (tempArgs[i].unitKind == ARG_KIND_IMPLICIT_LOCALID) {
3812             CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3813             halKernelParam->localIdIndex = i;
3814         }
3815         else
3816             CreateThreadArgData(&halKernelParam->argParams[i], i, nullptr, tempArgs);
3817 
3818         if (halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE)
3819         {
3820             if (IsKernelArg(halKernelParam->argParams[i]))
3821             {
3822                 // Kernel arg : calculate curbe size & adjust payloadoffset
3823                 if (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID)
3824                 {
3825                     halKernelParam->argParams[i].payloadOffset -= minKernelPlayloadOffset;
3826                 }
3827                 else
3828                 {
3829                     // ARG_KIND_IMPLICIT_LOCALID is only for visa3.3+, need to adjust payloadOffset of local id for visa3.3+ later.
3830                     adjustLocalIdPayloadOffset = true;
3831                 }
3832 
3833                 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3834                     if ((halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize > kernelCurbeSize))
3835                     {  // The largest one
3836                         kernelCurbeSize = halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize;
3837                     }
3838                 }
3839                 else
3840                 {
3841                     if ((halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize > kernelCurbeSize) && (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID))
3842                     {  // The largest one
3843                         kernelCurbeSize = halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize;
3844                     }
3845                 }
3846             }
3847         }
3848     }
3849 
3850     if ( m_stateBufferBounded != CM_STATE_BUFFER_NONE )
3851     {
3852         PCM_CONTEXT_DATA cmData = ( PCM_CONTEXT_DATA )m_device->GetAccelData();
3853         PCM_HAL_STATE state = cmData->cmHalState;
3854         kernelCurbeSize = state->pfnGetStateBufferSizeForKernel( state, this );
3855         halKernelParam->stateBufferType = state->pfnGetStateBufferTypeForKernel( state, this );
3856     }
3857 
3858     if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3859     {
3860         // GPGPU walker - implicit args
3861         for (uint32_t i = numArgs; i < numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM; i++)
3862         {
3863             halKernelParam->argParams[i].unitCount = 1;
3864             halKernelParam->argParams[i].kind = CM_ARGUMENT_GENERAL;
3865             halKernelParam->argParams[i].unitSize = 4;
3866             halKernelParam->argParams[i].payloadOffset = MOS_ALIGN_CEIL(kernelCurbeSize, 4) + (i - numArgs) * sizeof(uint32_t);
3867             halKernelParam->argParams[i].perThread = false;
3868         }
3869 
3870         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 0].firstValue, thrdSpaceWidth));
3871         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 1].firstValue, thrdSpaceHeight));
3872         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 2].firstValue, grpSpaceWidth));
3873         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 3].firstValue, grpSpaceHeight));
3874         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 4].firstValue, thrdSpaceWidth));
3875         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 5].firstValue, thrdSpaceHeight));
3876         halKernelParam->localIdIndex = halKernelParam->numArgs - 2;
3877     }
3878     halKernelParam->gpgpuWalkerParams.gpgpuEnabled = true;
3879     halKernelParam->gpgpuWalkerParams.groupWidth = grpSpaceWidth;
3880     halKernelParam->gpgpuWalkerParams.groupHeight = grpSpaceHeight;
3881     halKernelParam->gpgpuWalkerParams.groupDepth = grpSpaceDepth;
3882     halKernelParam->gpgpuWalkerParams.threadHeight = thrdSpaceHeight;
3883     halKernelParam->gpgpuWalkerParams.threadWidth = thrdSpaceWidth;
3884     halKernelParam->gpgpuWalkerParams.threadDepth = thrdSpaceDepth;
3885     //Get SLM size
3886     halKernelParam->slmSize = GetSLMSize();
3887 
3888     //Get spill area to adjust scratch space
3889     halKernelParam->spillSize = GetSpillMemUsed();
3890 
3891     //Set Barrier mode
3892     halKernelParam->barrierMode = m_barrierMode;
3893     halKernelParam->numberThreadsInGroup = thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
3894     if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3895         kernelCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 4) + CM_GPUWALKER_IMPLICIT_ARG_NUM * sizeof(uint32_t);
3896     else
3897         kernelCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 4);
3898     if ((kernelCurbeSize % 32) == 4) //The per-thread data occupy 2 GRF.
3899     {
3900         halKernelParam->curbeSizePerThread = 64;
3901     }
3902     else
3903     {
3904         halKernelParam->curbeSizePerThread = 32;
3905     }
3906     if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3907         halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32) - halKernelParam->curbeSizePerThread + halKernelParam->curbeSizePerThread *
3908             thrdSpaceWidth * thrdSpaceHeight;
3909         //Since the CURBE is 32 bytes alignment, for GPGPU walker without the user specified thread argument, implicit per-thread id arguments will occupy at most 32 bytes
3910         halKernelParam->crossThreadConstDataLen = MOS_ALIGN_CEIL(kernelCurbeSize, 32) - halKernelParam->curbeSizePerThread;
3911     }
3912     else {
3913         halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32) + halKernelParam->curbeSizePerThread *
3914             thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
3915         //Since the CURBE is 32 bytes alignment, for GPGPU walker without the user specified thread argument, implicit per-thread id arguments will occupy at most 32 bytes
3916         halKernelParam->crossThreadConstDataLen = MOS_ALIGN_CEIL(kernelCurbeSize, 32);
3917     }
3918     halKernelParam->payloadSize = 0; // no thread arg allowed
3919 
3920     // adjust payloadOffset of local id for visa3.3+
3921     if (adjustLocalIdPayloadOffset)
3922     {
3923         halKernelParam->argParams[halKernelParam->localIdIndex].payloadOffset = halKernelParam->crossThreadConstDataLen;
3924     }
3925 
3926     m_sizeInCurbe = GetAlignedCurbeSize(halKernelParam->totalCurbeSize);
3927 
3928     CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelIndirectData(&halKernelParam->indirectDataParam));
3929 
3930     if (m_samplerBtiCount != 0)
3931     {
3932         CmSafeMemCopy((void*)halKernelParam->samplerBTIParam.samplerInfo, (void*)m_samplerBtiEntry, sizeof(m_samplerBtiEntry));
3933         halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
3934 
3935         CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
3936         m_samplerBtiCount = 0;
3937     }
3938 
3939     CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces);
3940 
3941     UpdateKernelDataGlobalSurfaceInfo(halKernelParam);
3942 
3943     //Destroy Temp Args
3944     for (uint32_t j = 0; j < numArgs; j++)
3945     {
3946         if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
3947         {
3948             MosSafeDeleteArray(tempArgs[j].value);
3949         }
3950     }
3951     MosSafeDeleteArray(tempArgs);
3952 
3953     CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
3954 finish:
3955     if (hr != CM_SUCCESS)
3956     {
3957         //Clean allocated memory : need to count the implicit args
3958         if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3959 
3960             for (uint32_t i = 0; i < numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM; i++)
3961             {
3962                 if (halKernelParam)
3963                 {
3964                     if (halKernelParam->argParams[i].firstValue)
3965                     {
3966                         MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
3967                     }
3968                 }
3969             }
3970         }
3971         else
3972         {
3973             for (uint32_t i = 0; i < numArgs; i++)
3974             {
3975                 if (halKernelParam)
3976                 {
3977                     if (halKernelParam->argParams[i].firstValue)
3978                     {
3979                         MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
3980                     }
3981                 }
3982             }
3983         }
3984         //Destroy Temp Args in failing case
3985         if (tempArgs)
3986         {
3987             for (uint32_t j = 0; j < numArgs; j++)
3988             {
3989                 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
3990                 {
3991                     MosSafeDeleteArray(tempArgs[j].value);
3992                 }
3993             }
3994             MosSafeDeleteArray(tempArgs);
3995         }
3996     }
3997     return hr;
3998 }
3999 
4000 //*-----------------------------------------------------------------------------
4001 //| Purpose:    Prepare Kernel Data including thread args, kernel args
4002 //| Returns:    Result of the operation.
4003 //*-----------------------------------------------------------------------------
IsBatchBufferReusable(CmThreadSpaceRT * taskThreadSpace)4004 bool CmKernelRT::IsBatchBufferReusable( CmThreadSpaceRT * taskThreadSpace )
4005 {
4006     bool reusable = true;
4007     //Update m_id if the batch buffer is not reusable.
4008     if (m_dirty & CM_KERNEL_DATA_THREAD_ARG_DIRTY)
4009     {
4010         reusable = false; // if thread arg dirty
4011     }
4012     else if ((m_dirty & CM_KERNEL_DATA_KERNEL_ARG_DIRTY) && (m_curbeEnabled == false))
4013     {
4014         reusable = false; // if kernel arg dirty and curbe disabled
4015     }
4016     else if (m_dirty & CM_KERNEL_DATA_THREAD_COUNT_DIRTY)
4017     {
4018         reusable = false; // if thread count dirty
4019     }
4020     else if (m_threadSpace)
4021     {
4022        if (m_threadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DATA_DIRTY)
4023        {
4024           reusable = false; // if per kernel thread space exists and it is completely dirty
4025        }
4026     }
4027     else if (taskThreadSpace)
4028     {
4029        if (taskThreadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DATA_DIRTY)
4030        {
4031           reusable = false; // if per task thread space change and it is completely dirty
4032        }
4033     }
4034     return reusable;
4035 
4036 }
4037 
4038 //*-----------------------------------------------------------------------------
4039 //| Purpose:    Checks to see if kernel prologue has changed
4040 //| Returns:    Result of the operation.
4041 //*-----------------------------------------------------------------------------
IsPrologueDirty(void)4042 bool CmKernelRT::IsPrologueDirty( void )
4043 {
4044     bool prologueDirty = false;
4045 
4046     if( m_threadCount != m_lastThreadCount )
4047     {
4048         if( m_lastThreadCount )
4049         {
4050             if( m_threadCount == 1 || m_lastThreadCount == 1 )
4051             {
4052                 prologueDirty = true;
4053             }
4054         }
4055         m_lastThreadCount = m_threadCount;
4056     }
4057 
4058     if( m_adjustScoreboardY != m_lastAdjustScoreboardY )
4059     {
4060         if( m_lastAdjustScoreboardY )
4061         {
4062             prologueDirty = true;
4063         }
4064         m_lastAdjustScoreboardY = m_adjustScoreboardY;
4065     }
4066 
4067     return prologueDirty;
4068 }
4069 
4070 //*-----------------------------------------------------------------------------
4071 //| Purpose:    Prepare Kernel Data including thread args, kernel args
4072 //| Returns:    Result of the operation.
4073 //*-----------------------------------------------------------------------------
CreateKernelDataInternal(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadSpaceRT * threadSpace)4074 int32_t CmKernelRT::CreateKernelDataInternal(
4075     CmKernelData* & kernelData,  // out
4076     uint32_t& kernelDataSize,         // out
4077     const CmThreadSpaceRT* threadSpace )    // in
4078 {
4079     PCM_HAL_KERNEL_PARAM  halKernelParam       = nullptr;
4080     int32_t               hr                    = CM_SUCCESS;
4081     uint32_t              movInstNum            = 0;
4082     uint32_t              kernelCurbeSize          = 0;
4083     uint32_t              numArgs               = 0;
4084     uint32_t              bottomRange         = 1024;
4085     uint32_t              upRange             = 0;
4086     uint32_t              unitSize              = 0;
4087     bool                  hasThreadArg          = false;
4088     CmThreadSpaceRT         *cmThreadSpace       = nullptr;
4089     bool                  isKernelThreadSpace   = false;
4090     CM_ARG                *tempArgs            = nullptr;
4091     uint32_t              argSize               = 0;
4092     uint32_t              surfNum               = 0; //Pass needed BT entry numbers to HAL CM
4093     CmKernelRT             *cmKernel             = nullptr;
4094 
4095     if( threadSpace == nullptr && m_threadSpace!= nullptr)
4096     {
4097         cmThreadSpace = m_threadSpace;
4098         isKernelThreadSpace = true;
4099     }
4100     else
4101     {
4102         cmThreadSpace = const_cast<CmThreadSpaceRT*>(threadSpace);
4103     }
4104 
4105     CM_CHK_CMSTATUS_GOTOFINISH(CmKernelData::Create( this, kernelData ));
4106     halKernelParam = kernelData->GetHalCmKernelData();
4107     CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4108 
4109     //Get Num of args with surface array
4110     CM_CHK_CMSTATUS_GOTOFINISH(GetArgCountPlusSurfArray(argSize, numArgs));
4111 
4112     if( numArgs > 0)
4113     {
4114         //Create Temp args
4115         CM_CHK_CMSTATUS_GOTOFINISH(CreateTempArgs(numArgs, tempArgs));
4116         //Create move instructions
4117         CM_CHK_CMSTATUS_GOTOFINISH(CreateMovInstructions(movInstNum,   halKernelParam->movInsData, tempArgs, numArgs));
4118     }
4119 
4120     CM_CHK_CMSTATUS_GOTOFINISH(CalcKernelDataSize(movInstNum, numArgs, argSize, kernelDataSize));
4121     CM_CHK_CMSTATUS_GOTOFINISH(kernelData->SetKernelDataSize(kernelDataSize));
4122 
4123     if(!IsBatchBufferReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
4124     {
4125         m_id ++;
4126     }
4127 
4128     if( IsPrologueDirty( ) )
4129     {
4130         // can't re-use kernel binary in GSH
4131         // just update upper 16 bits
4132         uint64_t tempID = m_id;
4133         tempID >>= 48;
4134         tempID++;
4135         tempID <<= 48;
4136         // get rid of old values in upper 16 bits
4137         m_id <<= 16;
4138         m_id >>= 16;
4139         m_id |= tempID;
4140     }
4141 
4142     halKernelParam->clonedKernelParam.isClonedKernel = m_isClonedKernel;
4143     halKernelParam->clonedKernelParam.kernelID       = m_cloneKernelID;
4144     halKernelParam->clonedKernelParam.hasClones      = m_hasClones;
4145     halKernelParam->kernelId           = m_id; // kernel id , high 32-bit is kernel id, low 32-bit is kernel data id for batch buffer reuse
4146     halKernelParam->numArgs             = numArgs;
4147     halKernelParam->numThreads          = m_threadCount;
4148     halKernelParam->kernelBinarySize    = m_binarySize + movInstNum * CM_MOVE_INSTRUCTION_SIZE;
4149     halKernelParam->kernelDataSize      = kernelDataSize;
4150     halKernelParam->movInsDataSize      = movInstNum * CM_MOVE_INSTRUCTION_SIZE;
4151 
4152     halKernelParam->cmFlags             = m_curbeEnabled ? CM_FLAG_CURBE_ENABLED : 0;
4153     halKernelParam->cmFlags            |= m_nonstallingScoreboardEnabled ? CM_FLAG_NONSTALLING_SCOREBOARD_ENABLED : 0;
4154     halKernelParam->kernelDebugEnabled  = m_blhwDebugEnable;
4155 
4156     halKernelParam->kernelBinary        = (uint8_t*)m_binary;
4157 
4158     CM_CHK_CMSTATUS_GOTOFINISH( kernelData->GetCmKernel( cmKernel ) );
4159     if ( cmKernel == nullptr )
4160     {
4161         return CM_NULL_POINTER;
4162     }
4163     MOS_SecureStrcpy( halKernelParam->kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE, cmKernel->GetName() );
4164 
4165     if ( cmThreadSpace )
4166     {// either from per kernel thread space or per task thread space
4167         CM_CHK_CMSTATUS_GOTOFINISH(SortThreadSpace(cmThreadSpace)); // must be called before CreateThreadArgData
4168     }
4169 
4170     for(uint32_t i =0 ; i< numArgs; i++)
4171     {
4172         halKernelParam->argParams[i].unitCount        = tempArgs[ i ].unitCount;
4173         halKernelParam->argParams[i].kind              = (CM_HAL_KERNEL_ARG_KIND)(tempArgs[ i ].unitKind);
4174         halKernelParam->argParams[i].unitSize         = tempArgs[ i ].unitSize;
4175         halKernelParam->argParams[i].payloadOffset    = tempArgs[ i ].unitOffsetInPayload;
4176         halKernelParam->argParams[i].perThread        = (tempArgs[ i ].unitCount > 1) ? true :false;
4177         halKernelParam->argParams[i].nCustomValue      = tempArgs[ i ].nCustomValue;
4178         halKernelParam->argParams[i].aliasIndex       = tempArgs[ i ].aliasIndex;
4179         halKernelParam->argParams[i].aliasCreated     = tempArgs[ i ].aliasCreated;
4180         halKernelParam->argParams[i].isNull           = tempArgs[ i ].isNull;
4181 
4182         CreateThreadArgData(&halKernelParam->argParams[i], i, cmThreadSpace, tempArgs);
4183 
4184         if(CHECK_SURFACE_TYPE ( halKernelParam->argParams[i].kind,
4185             ARG_KIND_SURFACE_VME,
4186             ARG_KIND_SURFACE_SAMPLER,
4187             ARG_KIND_SURFACE2DUP_SAMPLER))
4188         {
4189             unitSize = CM_ARGUMENT_SURFACE_SIZE;
4190         }
4191         else
4192         {
4193             unitSize = halKernelParam->argParams[i].unitSize;
4194         }
4195 
4196         if (halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE)
4197         {
4198             if(IsKernelArg(halKernelParam->argParams[i]))
4199             {
4200                 // Kernel arg : calculate curbe size & adjust payloadoffset
4201                 // Note: Here the payloadOffset may be different from original value
4202                 uint32_t offset = halKernelParam->argParams[i].payloadOffset - CM_PAYLOAD_OFFSET;
4203                 if (offset >= kernelCurbeSize)
4204                 {
4205                     kernelCurbeSize = offset + unitSize;
4206                 }
4207                 halKernelParam->argParams[i].payloadOffset -= CM_PAYLOAD_OFFSET;
4208             }
4209         }
4210 
4211         if(!IsKernelArg(halKernelParam->argParams[i]))
4212         {   //Thread arg : Calculate payload size & adjust payloadoffset
4213             hasThreadArg  = true;
4214             halKernelParam->argParams[i].payloadOffset -= CM_PAYLOAD_OFFSET;
4215 
4216             if(halKernelParam->argParams[i].payloadOffset < bottomRange)
4217             {
4218                bottomRange = halKernelParam->argParams[i].payloadOffset;
4219             }
4220             if(halKernelParam->argParams[i].payloadOffset >=  upRange)
4221             {
4222                upRange = halKernelParam->argParams[i].payloadOffset + unitSize;
4223             }
4224         }
4225     }
4226 
4227     if ( m_stateBufferBounded != CM_STATE_BUFFER_NONE )
4228     {
4229         PCM_CONTEXT_DATA cmData = ( PCM_CONTEXT_DATA )m_device->GetAccelData();
4230         PCM_HAL_STATE state = cmData->cmHalState;
4231         kernelCurbeSize = state->pfnGetStateBufferSizeForKernel( state, this );
4232         halKernelParam->stateBufferType = state->pfnGetStateBufferTypeForKernel( state, this );
4233     }
4234 
4235     halKernelParam->payloadSize         = hasThreadArg ? MOS_ALIGN_CEIL(upRange -  bottomRange, 4): 0;
4236     halKernelParam->totalCurbeSize      = MOS_ALIGN_CEIL(kernelCurbeSize, 32);
4237     halKernelParam->curbeSizePerThread  = halKernelParam->totalCurbeSize;
4238 
4239     halKernelParam->perThreadArgExisted = hasThreadArg;
4240 
4241     m_sizeInCurbe = GetAlignedCurbeSize( kernelCurbeSize );
4242 
4243     if ( halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE )
4244     {
4245         for(uint32_t i=0; i< numArgs; i++)
4246         {
4247             if(!IsKernelArg(halKernelParam->argParams[i]))
4248             {  // thread arg: need to minus curbe size
4249                 halKernelParam->argParams[i].payloadOffset -= halKernelParam->curbeSizePerThread;
4250             }
4251         }
4252     }
4253 
4254     //Create indirect data
4255     CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelIndirectData(&halKernelParam->indirectDataParam));
4256 
4257     if ( m_samplerBtiCount != 0 )
4258     {
4259         CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4260         halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4261 
4262         CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4263         m_samplerBtiCount = 0;
4264     }
4265 
4266     CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces);
4267 
4268     //Create thread space param: only avaliable if per kernel ts exists
4269     if(m_threadSpace)
4270     {
4271         CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadSpaceParam(&halKernelParam->kernelThreadSpaceParam, m_threadSpace));
4272     }
4273 
4274     //Get SLM size
4275     halKernelParam->slmSize = GetSLMSize();
4276 
4277     //Get Spill mem used
4278     halKernelParam->spillSize = GetSpillMemUsed();
4279 
4280     //Set Barrier mode
4281     halKernelParam->barrierMode = m_barrierMode;
4282 
4283     CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4284 
4285     //Destroy Temp Args
4286     for (uint32_t j = 0; j < numArgs; j++)
4287     {
4288         if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4289         {
4290             MosSafeDeleteArray(tempArgs[j].value);
4291         }
4292     }
4293     MosSafeDeleteArray( tempArgs );
4294 
4295     CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4296 finish:
4297     if(hr != CM_SUCCESS)
4298     {
4299          if(halKernelParam)
4300          {
4301              //Clean allocated memory
4302              for(uint32_t i =0 ; i< numArgs; i++)
4303              {
4304                 if( halKernelParam->argParams[i].firstValue )
4305                 {
4306                     MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
4307                 }
4308              }
4309          }
4310 
4311          //Destroy Temp Args
4312          if (tempArgs)
4313          {
4314              for (uint32_t j = 0; j < numArgs; j++)
4315              {
4316                  if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4317                  {
4318                      MosSafeDeleteArray(tempArgs[j].value);
4319                  }
4320              }
4321              MosSafeDeleteArray(tempArgs);
4322          }
4323     }
4324     return hr;
4325 }
4326 
4327 //*-----------------------------------------------------------------------------
4328 //| Purpose:    Update kernel data's kernel arg, thread arg, thread count
4329 //| Returns:    Result of the operation.
4330 //*-----------------------------------------------------------------------------
UpdateKernelData(CmKernelData * kernelData,const CmThreadSpaceRT * threadSpace)4331 int32_t CmKernelRT::UpdateKernelData(
4332     CmKernelData*   kernelData,  // in
4333     const CmThreadSpaceRT* threadSpace)
4334 {
4335     int32_t               hr                      = CM_SUCCESS;
4336     PCM_HAL_KERNEL_PARAM  halKernelParam         = nullptr;
4337     bool                  bbResuable             = true;
4338     CmThreadSpaceRT         *cmThreadSpace         = nullptr;
4339     bool                  isKernelThreadSpace     = false;
4340     uint32_t              argIndexStep            = 0;
4341     uint32_t              argIndex                = 0;
4342     uint32_t              surfNum                 = 0; //Update Number of surface used by kernel
4343 
4344     if( threadSpace == nullptr && m_threadSpace!= nullptr)
4345     {
4346         cmThreadSpace = m_threadSpace;
4347         isKernelThreadSpace = true;
4348     }
4349     else
4350     {
4351         cmThreadSpace = const_cast<CmThreadSpaceRT*>(threadSpace);
4352     }
4353 
4354     CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
4355     CM_ASSERT(kernelData->IsInUse() == false);
4356 
4357     halKernelParam = kernelData->GetHalCmKernelData();
4358     CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4359 
4360     if(!IsBatchBufferReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
4361     {
4362         m_id ++;
4363         halKernelParam->kernelId = m_id;
4364     }
4365 
4366     //Update arguments
4367     for(uint32_t orgArgIndex =0 ; orgArgIndex< m_argCount; orgArgIndex++)
4368     {
4369         argIndexStep = 1;
4370 
4371         if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4372                         ARG_KIND_SURFACE,
4373                         ARG_KIND_SURFACE_1D,
4374                         ARG_KIND_SURFACE_2D,
4375                         ARG_KIND_SURFACE_2D_UP,
4376                         ARG_KIND_SURFACE_SAMPLER,
4377                         ARG_KIND_SURFACE2DUP_SAMPLER,
4378                         ARG_KIND_SURFACE_3D,
4379                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4380                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
4381                         ARG_KIND_SURFACE_2D_SCOREBOARD,
4382                         ARG_KIND_STATE_BUFFER ) )
4383         {
4384             argIndexStep = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array exists
4385         }
4386         else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind,  ARG_KIND_SURFACE_VME))
4387         {
4388             argIndexStep = m_args[orgArgIndex].unitVmeArraySize;
4389         }
4390 
4391         if(m_args[ orgArgIndex ].isDirty)
4392         {
4393             if(m_args[ orgArgIndex ].unitCount > 1)
4394             { // thread arg is dirty
4395                 bbResuable          = false;
4396             }
4397 
4398             if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4399                         ARG_KIND_SURFACE,
4400                         ARG_KIND_SURFACE_1D,
4401                         ARG_KIND_SURFACE_2D,
4402                         ARG_KIND_SURFACE_2D_UP,
4403                         ARG_KIND_SURFACE_SAMPLER,
4404                         ARG_KIND_SURFACE2DUP_SAMPLER,
4405                         ARG_KIND_SURFACE_3D,
4406                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4407                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
4408                         ARG_KIND_SURFACE_2D_SCOREBOARD,
4409                         ARG_KIND_STATE_BUFFER ) )
4410             {  // for surface args
4411 
4412                 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4413                 if(m_args[ orgArgIndex ].unitCount ==  1) // kernel arg
4414                 {
4415                     if (numSurfaces > 1)
4416                     {
4417                         for (uint32_t kk = 0; kk < numSurfaces; kk++)
4418                         {
4419                             CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4420                             CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4421                                 m_args[orgArgIndex].value + kk*sizeof(uint32_t), sizeof(uint32_t));
4422                             halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4423                             halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4424                             halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4425 
4426                             if (!m_args[orgArgIndex].surfIndex[kk])
4427                             {
4428                                 //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
4429                                 //This is for special usage if there is empty element in surface array.
4430                                 halKernelParam->argParams[argIndex + kk].kind = CM_ARGUMENT_SURFACE2D;
4431                                 continue;
4432                             }
4433 
4434                             halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].surfArrayArg[kk].argKindForArray;
4435                             halKernelParam->argParams[argIndex + kk].nCustomValue = m_args[orgArgIndex].surfArrayArg[kk].addressModeForArray;
4436                         }
4437                     }
4438                     else
4439                     {
4440                         CM_ASSERT(halKernelParam->argParams[argIndex].firstValue != nullptr);
4441                         CmSafeMemCopy(halKernelParam->argParams[argIndex].firstValue,
4442                                 m_args[ orgArgIndex ].value, sizeof(uint32_t));
4443                         halKernelParam->argParams[argIndex].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[ orgArgIndex ].unitKind;
4444                         halKernelParam->argParams[argIndex].aliasIndex   = m_args[orgArgIndex].aliasIndex;
4445                         halKernelParam->argParams[argIndex].aliasCreated = m_args[orgArgIndex].aliasCreated;
4446                         halKernelParam->argParams[argIndex].isNull = m_args[orgArgIndex].isNull;
4447                     }
4448 
4449                  }
4450                  else // thread arg
4451                  {
4452                     uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4453                     uint32_t *surfaces = (uint32_t *)MOS_NewArray(uint8_t, (sizeof(uint32_t) * m_args[orgArgIndex].unitCount));
4454                     CM_CHK_NULL_GOTOFINISH(surfaces, CM_OUT_OF_HOST_MEMORY);
4455                     for (uint32_t kk=0;  kk< numSurfaces ; kk++)
4456                     {
4457                         for (uint32_t s = 0; s < m_args[orgArgIndex].unitCount; s++)
4458                         {
4459                             surfaces[s] = *(uint32_t *)((uint32_t *)m_args[orgArgIndex].value + kk + numSurfaces * s);
4460                         }
4461                         CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4462                             surfaces, sizeof(uint32_t) * m_args[orgArgIndex].unitCount);
4463 
4464                         halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[ orgArgIndex ].unitKind;
4465 
4466                         halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4467                         halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4468                         halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4469 
4470                     }
4471                     MosSafeDeleteArray(surfaces);
4472                  }
4473 
4474             }
4475             else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4476             {
4477                 uint32_t numSurfaces = m_args[orgArgIndex].unitVmeArraySize;
4478                 if (m_args[orgArgIndex].unitCount == 1) // kernel arg
4479                 {
4480                     uint32_t vmeSurfOffset = 0;
4481                     for (uint32_t kk = 0; kk< numSurfaces; kk++)
4482                     {
4483                         uint16_t vmeSize = (uint16_t)getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[orgArgIndex].value + vmeSurfOffset));
4484 
4485                         // reallocate the firstValue for VME surface every time
4486                         // since the number of surfaces may vary
4487                         MosSafeDeleteArray(halKernelParam->argParams[argIndex + kk].firstValue);
4488                         halKernelParam->argParams[argIndex + kk].firstValue = MOS_NewArray(uint8_t, vmeSize);
4489                         CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4490                         CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4491                             m_args[orgArgIndex].value + vmeSurfOffset, vmeSize);
4492 
4493                         halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4494 
4495                         halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4496                         halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4497                         halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4498                         halKernelParam->argParams[argIndex + kk].unitSize = vmeSize;
4499                         vmeSurfOffset += vmeSize;
4500                     }
4501                 }
4502             }
4503             else
4504             {
4505                 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadArgData(&halKernelParam->argParams[argIndex ], orgArgIndex, cmThreadSpace, m_args));
4506             }
4507         }
4508         argIndex += argIndexStep;
4509     }
4510 
4511     //Update Thread space param
4512     if(m_threadSpace && m_threadSpace->GetDirtyStatus())
4513     {
4514 
4515         CM_CHK_CMSTATUS_GOTOFINISH(SortThreadSpace(m_threadSpace));
4516 
4517         uint32_t threadSpaceWidth = 0, threadSpaceHeight = 0;
4518         PCM_HAL_KERNEL_THREADSPACE_PARAM  cmKernelThreadSpaceParam = &halKernelParam->kernelThreadSpaceParam;
4519         m_threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
4520 
4521         cmKernelThreadSpaceParam->threadSpaceWidth  = (uint16_t)threadSpaceWidth;
4522         cmKernelThreadSpaceParam->threadSpaceHeight = (uint16_t)threadSpaceHeight;
4523         m_threadSpace->GetDependencyPatternType(cmKernelThreadSpaceParam->patternType);
4524         m_threadSpace->GetWalkingPattern(cmKernelThreadSpaceParam->walkingPattern);
4525         m_threadSpace->GetColorCountMinusOne(cmKernelThreadSpaceParam->colorCountMinusOne);
4526 
4527         CM_HAL_DEPENDENCY*     dependency = nullptr;
4528         m_threadSpace->GetDependency( dependency);
4529 
4530         if(dependency != nullptr)
4531         {
4532             CmSafeMemCopy(&cmKernelThreadSpaceParam->dependencyInfo, dependency, sizeof(CM_HAL_DEPENDENCY));
4533         }
4534 
4535         if( m_threadSpace->CheckWalkingParametersSet() )
4536         {
4537             CM_CHK_CMSTATUS_GOTOFINISH(m_threadSpace->GetWalkingParameters(cmKernelThreadSpaceParam->walkingParams));
4538         }
4539 
4540         if( m_threadSpace->CheckDependencyVectorsSet() )
4541         {
4542             CM_CHK_CMSTATUS_GOTOFINISH(m_threadSpace->GetDependencyVectors(cmKernelThreadSpaceParam->dependencyVectors));
4543         }
4544 
4545         if(m_threadSpace->IsThreadAssociated())
4546         {// media object only
4547             uint32_t *boardOrder = nullptr;
4548             m_threadSpace->GetBoardOrder(boardOrder);
4549             CM_CHK_NULL_GOTOFINISH_CMERROR(boardOrder);
4550 
4551             CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
4552             m_threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
4553             CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceUnit);
4554 
4555             cmKernelThreadSpaceParam->reuseBBUpdateMask = 0;
4556             for(uint32_t i=0; i< threadSpaceWidth * threadSpaceHeight ; i++)
4557             {
4558                 cmKernelThreadSpaceParam->threadCoordinates[i].x = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.x;
4559                 cmKernelThreadSpaceParam->threadCoordinates[i].y = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.y;
4560                 cmKernelThreadSpaceParam->threadCoordinates[i].mask = threadSpaceUnit[boardOrder[i]].dependencyMask;
4561                 cmKernelThreadSpaceParam->threadCoordinates[i].resetMask = threadSpaceUnit[boardOrder[i]].reset;
4562                 cmKernelThreadSpaceParam->threadCoordinates[i].color = threadSpaceUnit[boardOrder[i]].scoreboardColor;
4563                 cmKernelThreadSpaceParam->threadCoordinates[i].sliceSelect = threadSpaceUnit[boardOrder[i]].sliceDestinationSelect;
4564                 cmKernelThreadSpaceParam->threadCoordinates[i].subSliceSelect = threadSpaceUnit[boardOrder[i]].subSliceDestinationSelect;
4565                 cmKernelThreadSpaceParam->reuseBBUpdateMask |= threadSpaceUnit[boardOrder[i]].reset;
4566             }
4567 
4568             if( cmKernelThreadSpaceParam->patternType == CM_WAVEFRONT26Z )
4569             {
4570                 CM_HAL_WAVEFRONT26Z_DISPATCH_INFO dispatchInfo;
4571                 m_threadSpace->GetWavefront26ZDispatchInfo(dispatchInfo);
4572 
4573                 if (cmKernelThreadSpaceParam->dispatchInfo.numWaves >= dispatchInfo.numWaves)
4574                 {
4575                     cmKernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
4576                     CmSafeMemCopy(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
4577                 }
4578                 else
4579                 {
4580                     cmKernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
4581                     MosSafeDeleteArray(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave);
4582                     cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave = MOS_NewArray(uint32_t, dispatchInfo.numWaves);
4583                     CM_CHK_NULL_GOTOFINISH(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, CM_OUT_OF_HOST_MEMORY);
4584                     CmSafeMemCopy(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
4585                 }
4586             }
4587         }
4588     }
4589 
4590     // Update indirect data
4591     if( m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY)
4592     {
4593         halKernelParam->indirectDataParam.indirectDataSize = m_usKernelPayloadDataSize;
4594         halKernelParam->indirectDataParam.surfaceCount     = m_usKernelPayloadSurfaceCount;
4595 
4596         if(m_usKernelPayloadDataSize != 0)
4597         {
4598             if(m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY)
4599             { // size change, need to reallocate
4600                 MosSafeDeleteArray(halKernelParam->indirectDataParam.indirectData);
4601                 halKernelParam->indirectDataParam.indirectData = MOS_NewArray(uint8_t, m_usKernelPayloadDataSize);
4602                 CM_CHK_NULL_GOTOFINISH(halKernelParam->indirectDataParam.indirectData, CM_OUT_OF_HOST_MEMORY);
4603             }
4604             CmSafeMemCopy(halKernelParam->indirectDataParam.indirectData, (void *)m_kernelPayloadData, m_usKernelPayloadDataSize);
4605         }
4606 
4607         if(m_usKernelPayloadSurfaceCount != 0)
4608         {
4609             if(m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY)
4610             { // size change, need to reallocate
4611                 MosSafeDeleteArray(halKernelParam->indirectDataParam.surfaceInfo);
4612                 halKernelParam->indirectDataParam.surfaceInfo = MOS_NewArray(CM_INDIRECT_SURFACE_INFO, m_usKernelPayloadSurfaceCount);
4613                 CM_CHK_NULL_GOTOFINISH(halKernelParam->indirectDataParam.surfaceInfo, CM_OUT_OF_HOST_MEMORY);
4614 
4615             }
4616             CmSafeMemCopy((void*)halKernelParam->indirectDataParam.surfaceInfo, (void*)m_IndirectSurfaceInfoArray,
4617                              m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4618             //clear m_IndirectSurfaceInfoArray every enqueue
4619             CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4620             m_usKernelPayloadSurfaceCount = 0;
4621         }
4622     }
4623 
4624     if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
4625     {
4626         if ( m_samplerBtiCount != 0 )
4627         {
4628             CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4629             halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4630 
4631             CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4632             m_samplerBtiCount = 0;
4633         }
4634     }
4635     CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4636 
4637     CM_CHK_CMSTATUS_GOTOFINISH(CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces));
4638 
4639     CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4640 
4641 finish:
4642     if( hr != CM_SUCCESS)
4643     {
4644         if( halKernelParam )
4645         {
4646             MosSafeDeleteArray(halKernelParam->indirectDataParam.indirectData);
4647             MosSafeDeleteArray(halKernelParam->indirectDataParam.surfaceInfo);
4648         }
4649     }
4650     return hr;
4651 }
4652 
4653 //*-----------------------------------------------------------------------------
4654 //| Purpose:    Update kernel data's kernel arg, thread arg, thread count
4655 //| Returns:    Result of the operation.
4656 //*-----------------------------------------------------------------------------
UpdateKernelData(CmKernelData * kernelData,const CmThreadGroupSpace * threadGroupSpace)4657 int32_t CmKernelRT::UpdateKernelData(
4658     CmKernelData*   kernelData,  // in
4659     const CmThreadGroupSpace* threadGroupSpace )    // in
4660 {
4661     int32_t               hr                      = CM_SUCCESS;
4662     PCM_HAL_KERNEL_PARAM  halKernelParam         = nullptr;
4663     uint32_t              argIndexStep            = 0;
4664     uint32_t              argIndex                = 0;
4665     uint32_t              surfNum                 = 0;
4666     auto getVersionAsInt = [](int major, int minor) { return major * 100 + minor; };
4667 
4668     CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
4669     CM_ASSERT(kernelData->IsInUse() == false);
4670 
4671     halKernelParam = kernelData->GetHalCmKernelData();
4672     CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4673 
4674     CM_CHK_NULL_GOTOFINISH_CMERROR(threadGroupSpace);
4675 
4676     //Update arguments
4677     for(uint32_t orgArgIndex =0 ; orgArgIndex< m_argCount; orgArgIndex++)
4678     {
4679         argIndexStep = 1;
4680 
4681         if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4682                         ARG_KIND_SURFACE,
4683                         ARG_KIND_SURFACE_1D,
4684                         ARG_KIND_SURFACE_2D,
4685                         ARG_KIND_SURFACE_2D_UP,
4686                         ARG_KIND_SURFACE_SAMPLER,
4687                         ARG_KIND_SURFACE2DUP_SAMPLER,
4688                         ARG_KIND_SURFACE_3D,
4689                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4690                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
4691                         ARG_KIND_SURFACE_2D_SCOREBOARD,
4692                         ARG_KIND_STATE_BUFFER ) )
4693         {
4694             argIndexStep = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array exists
4695         }
4696         else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4697         {
4698             argIndexStep = m_args[orgArgIndex].unitVmeArraySize;
4699         }
4700 
4701         if(m_args[ orgArgIndex ].isDirty)
4702         {
4703             if(m_args[ orgArgIndex ].unitCount > 1)
4704             { // thread arg is dirty
4705                 CM_ASSERTMESSAGE("Error: Thread arg is not allowed in GPGPU walker.");
4706                 hr = CM_FAILURE; // Thread arg is not allowed in GPGPU walker
4707                 goto finish;
4708             }
4709 
4710             if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4711                         ARG_KIND_SURFACE,
4712                         ARG_KIND_SURFACE_1D,
4713                         ARG_KIND_SURFACE_2D,
4714                         ARG_KIND_SURFACE_2D_UP,
4715                         ARG_KIND_SURFACE_SAMPLER,
4716                         ARG_KIND_SURFACE2DUP_SAMPLER,
4717                         ARG_KIND_SURFACE_3D,
4718                         ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4719                         ARG_KIND_SURFACE_SAMPLER8X8_VA,
4720                         ARG_KIND_SURFACE_2D_SCOREBOARD,
4721                         ARG_KIND_STATE_BUFFER ) )
4722             {  // for surface args
4723                 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4724                 if(m_args[ orgArgIndex ].unitCount ==  1) // kernel arg
4725                 {
4726                     if (numSurfaces > 1 )
4727                     {
4728                         for(uint32_t kk=0;  kk< numSurfaces ; kk++)
4729                         {
4730                             CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue
4731                                       != nullptr);
4732                             CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4733                                           m_args[ orgArgIndex ].value + kk*sizeof(uint32_t),
4734                                           sizeof(uint32_t));
4735                             halKernelParam->argParams[argIndex + kk].aliasIndex
4736                                     = m_args[orgArgIndex].aliasIndex;
4737                             halKernelParam->argParams[argIndex + kk].aliasCreated
4738                                     = m_args[orgArgIndex].aliasCreated;
4739                             halKernelParam->argParams[argIndex + kk].isNull
4740                                     = m_args[orgArgIndex].isNull;
4741 
4742                             if (!m_args[orgArgIndex].surfIndex[kk])
4743                             {
4744                                 //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
4745                                 //This is for special usage if there is empty element in surface array.
4746                                 halKernelParam->argParams[argIndex + kk].kind = CM_ARGUMENT_SURFACE2D;
4747                                 continue;
4748                             }
4749                             halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4750                             halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].surfArrayArg[kk].argKindForArray;
4751                             halKernelParam->argParams[argIndex + kk].nCustomValue = m_args[orgArgIndex].surfArrayArg[kk].addressModeForArray;
4752 
4753                         }
4754                     }
4755                     else
4756                     {
4757                         CM_ASSERT(halKernelParam->argParams[argIndex].firstValue != nullptr);
4758                         halKernelParam->argParams[argIndex].kind
4759                                 = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4760                         halKernelParam->argParams[argIndex].aliasIndex
4761                                 = m_args[orgArgIndex].aliasIndex;
4762                         halKernelParam->argParams[argIndex].aliasCreated
4763                                 = m_args[orgArgIndex].aliasCreated;
4764                         halKernelParam->argParams[argIndex].isNull
4765                                 = m_args[orgArgIndex].isNull;
4766                         if (halKernelParam->argParams[argIndex].isNull)
4767                         {
4768                             *(halKernelParam->argParams[argIndex].firstValue)
4769                                     = 0;
4770                         }
4771                         else
4772                         {
4773                             CmSafeMemCopy(
4774                                 halKernelParam->argParams[argIndex].firstValue,
4775                                 m_args[orgArgIndex].value, sizeof(uint32_t));
4776                         }
4777                     }
4778                 }
4779             }
4780             else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4781             {
4782                 uint32_t numSurfaces = m_args[orgArgIndex].unitVmeArraySize;
4783                 if (m_args[orgArgIndex].unitCount == 1) // kernel arg
4784                 {
4785                     uint32_t vmeSurfOffset = 0;
4786                     for (uint32_t kk = 0; kk< numSurfaces; kk++)
4787                     {
4788                         uint32_t vmeSize = getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[orgArgIndex].value + vmeSurfOffset));
4789 
4790                         // reallocate the firstValue for VME surface every time
4791                         // since the number of surfaces may vary
4792                         MosSafeDeleteArray(halKernelParam->argParams[argIndex + kk].firstValue);
4793                         halKernelParam->argParams[argIndex + kk].firstValue = MOS_NewArray(uint8_t, vmeSize);
4794                         CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4795                         CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4796                             m_args[orgArgIndex].value + vmeSurfOffset, vmeSize);
4797 
4798                         halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4799 
4800                         halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4801                         halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4802                         halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4803                         halKernelParam->argParams[argIndex + kk].unitSize = m_args[orgArgIndex].unitSize;
4804                         vmeSurfOffset += vmeSize;
4805                     }
4806                 }
4807             }
4808             else
4809             {
4810                 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadArgData(&halKernelParam->argParams[argIndex ], orgArgIndex, nullptr, m_args));
4811             }
4812         }
4813         argIndex += argIndexStep;
4814     }
4815 
4816     if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
4817     {
4818         if ( m_samplerBtiCount != 0 )
4819         {
4820             CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4821             halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4822 
4823             CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4824             m_samplerBtiCount = 0;
4825         }
4826     }
4827 
4828     CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4829 
4830     CM_CHK_CMSTATUS_GOTOFINISH(CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces));
4831 
4832     // GPGPU walker - implicit args
4833     uint32_t thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth;
4834     threadGroupSpace->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);
4835 
4836     halKernelParam->gpgpuWalkerParams.groupDepth = grpSpaceDepth;
4837     halKernelParam->gpgpuWalkerParams.groupHeight = grpSpaceHeight;
4838     halKernelParam->gpgpuWalkerParams.groupWidth  = grpSpaceWidth;
4839     halKernelParam->gpgpuWalkerParams.threadDepth = thrdSpaceDepth;
4840     halKernelParam->gpgpuWalkerParams.threadWidth  = thrdSpaceWidth;
4841     halKernelParam->gpgpuWalkerParams.threadHeight = thrdSpaceHeight;
4842 
4843     if (getVersionAsInt(m_program->m_cisaMajorVersion, m_program->m_cisaMinorVersion) < getVersionAsInt(3, 3))
4844     {
4845         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 0].firstValue, thrdSpaceWidth));
4846         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 1].firstValue, thrdSpaceHeight));
4847         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 2].firstValue, grpSpaceWidth));
4848         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 3].firstValue, grpSpaceHeight));
4849         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 4].firstValue, thrdSpaceWidth));
4850         CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 5].firstValue, thrdSpaceHeight));
4851     }
4852 
4853     CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4854 finish:
4855     return hr;
4856 }
4857 
4858 //*-----------------------------------------------------------------------------
4859 //| Purpose:    Create kernel indirect data
4860 //| Returns:    Result of the operation.
4861 //*-----------------------------------------------------------------------------
CreateKernelIndirectData(PCM_HAL_INDIRECT_DATA_PARAM halIndirectData)4862 int32_t CmKernelRT::CreateKernelIndirectData(
4863     PCM_HAL_INDIRECT_DATA_PARAM  halIndirectData )    // in/out
4864 {
4865     int32_t hr = CM_SUCCESS;
4866 
4867     halIndirectData->indirectDataSize = m_usKernelPayloadDataSize;
4868     halIndirectData->surfaceCount     = m_usKernelPayloadSurfaceCount;
4869 
4870     if( halIndirectData->indirectData == nullptr &&  m_usKernelPayloadDataSize != 0)
4871     {
4872         halIndirectData->indirectData = MOS_NewArray(uint8_t, halIndirectData->indirectDataSize);
4873         CM_CHK_NULL_GOTOFINISH(halIndirectData->indirectData, CM_OUT_OF_HOST_MEMORY);
4874     }
4875 
4876     // For future kernel data, pKbyte is starting point
4877     if( halIndirectData->surfaceInfo == nullptr &&  m_usKernelPayloadSurfaceCount != 0)
4878     {
4879         halIndirectData->surfaceInfo = MOS_NewArray(CM_INDIRECT_SURFACE_INFO, halIndirectData->surfaceCount);
4880         CM_CHK_NULL_GOTOFINISH(halIndirectData->surfaceInfo, CM_OUT_OF_HOST_MEMORY);
4881     }
4882 
4883     if(m_usKernelPayloadDataSize != 0)
4884     {
4885         CmSafeMemCopy(halIndirectData->indirectData, (void *)m_kernelPayloadData, m_usKernelPayloadDataSize);
4886     }
4887 
4888     if(m_usKernelPayloadSurfaceCount != 0)
4889     {
4890         CmSafeMemCopy((void*)halIndirectData->surfaceInfo, (void*)m_IndirectSurfaceInfoArray,
4891                     m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4892         //clear m_IndirectSurfaceInfoArray every enqueue
4893         CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4894         m_usKernelPayloadSurfaceCount = 0;
4895     }
4896 finish:
4897     if( hr != CM_SUCCESS)
4898     {
4899         if(halIndirectData->indirectData)                 MosSafeDeleteArray(halIndirectData->indirectData);
4900         if(halIndirectData->surfaceInfo)                  MosSafeDeleteArray(halIndirectData->surfaceInfo);
4901     }
4902     return hr;
4903 }
4904 
4905 //*-----------------------------------------------------------------------------
4906 //| Purpose:    UpdateLastKernelData
4907 //| Returns:    Result of the operation.
4908 //*-----------------------------------------------------------------------------
UpdateLastKernelData(CmKernelData * & kernelData)4909 int32_t CmKernelRT::UpdateLastKernelData(
4910     CmKernelData* & kernelData)    // in
4911 {
4912     int32_t hr = CM_SUCCESS;
4913 
4914     if( kernelData == nullptr || m_lastKernelData == kernelData )
4915     {
4916         CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4917         return CM_NULL_POINTER;
4918     }
4919 
4920     if(m_lastKernelData)
4921     {
4922         CmKernelData::Destroy(m_lastKernelData); // reduce ref count or delete it
4923     }
4924     CSync* kernelLock = m_device->GetProgramKernelLock();
4925     CLock locker(*kernelLock);
4926     m_lastKernelData = kernelData;
4927     m_lastKernelData->Acquire();
4928     m_lastKernelDataSize = m_lastKernelData->GetKernelDataSize();
4929 
4930     return hr;
4931 }
4932 
4933 //*-----------------------------------------------------------------------------
4934 //| Purpose:    Wrapper of  CmKernelData::Destroy.
4935 //| Returns:    Result of the operation.
4936 //*-----------------------------------------------------------------------------
ReleaseKernelData(CmKernelData * & kernelData)4937 int32_t CmKernelRT::ReleaseKernelData(
4938     CmKernelData* & kernelData)
4939 {
4940     int32_t hr = CM_SUCCESS;
4941 
4942     if( kernelData == nullptr)
4943     {
4944         CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4945         return CM_NULL_POINTER;
4946     }
4947 
4948     CSync* kernelLock = m_device->GetProgramKernelLock();
4949     CLock locker(*kernelLock);
4950 
4951     if(m_lastKernelData == kernelData)
4952     {
4953         // If the kernel data is the last kernel data
4954         // Need to update m_lastKernelData.
4955         hr = CmKernelData::Destroy(m_lastKernelData);
4956     }
4957     else
4958     {
4959         hr = CmKernelData::Destroy(kernelData);
4960     }
4961 
4962     return hr;
4963 }
4964 
4965 //*-----------------------------------------------------------------------------
4966 //| Purpose:   Acquire Kernel and Program
4967 //*-----------------------------------------------------------------------------
AcquireKernelProgram()4968 int32_t CmKernelRT::AcquireKernelProgram()
4969 {
4970     CSync* kernelLock = m_device->GetProgramKernelLock();
4971     CLock locker(*kernelLock);
4972 
4973     this->Acquire(); // increase kernel's ref count
4974     m_program->Acquire(); // increase program's ref count
4975 
4976     return CM_SUCCESS;
4977 }
4978 
4979 //*-----------------------------------------------------------------------------
4980 //| Purpose:   Acquire KenrelData, Kernel and Program
4981 //*-----------------------------------------------------------------------------
AcquireKernelData(CmKernelData * & kernelData)4982 int32_t CmKernelRT::AcquireKernelData(
4983     CmKernelData * &kernelData)
4984 {
4985     int32_t hr = CM_SUCCESS;
4986 
4987     if (kernelData == nullptr)
4988     {
4989         CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4990         return CM_NULL_POINTER;
4991     }
4992 
4993     CSync* kernelLock = m_device->GetProgramKernelLock();
4994     CLock locker(*kernelLock);
4995     kernelData->Acquire(); // increase kernel data's ref count
4996 
4997     return hr;
4998 }
4999 
SetAsClonedKernel(uint32_t cloneKernelID)5000 void CmKernelRT::SetAsClonedKernel(uint32_t cloneKernelID)
5001 {
5002     m_isClonedKernel = true;
5003     m_cloneKernelID = cloneKernelID;
5004 }
5005 
GetCloneKernelID(uint32_t & cloneKernelID)5006 bool CmKernelRT::GetCloneKernelID(uint32_t& cloneKernelID)
5007 {
5008     if (m_isClonedKernel)
5009     {
5010         cloneKernelID = m_cloneKernelID;
5011         return true;
5012     }
5013 
5014     return false;
5015 }
5016 
SetHasClones()5017 void CmKernelRT::SetHasClones()
5018 {
5019     m_hasClones = true;
5020 }
5021 
5022 //*-----------------------------------------------------------------------------
5023 //| Purpose:   Clone/copy current kernel
5024 //| Returns:   New kernel with content of source kernel
5025 //*-----------------------------------------------------------------------------
CloneKernel(CmKernelRT * & kernelOut,uint32_t id)5026 int32_t CmKernelRT::CloneKernel(CmKernelRT *& kernelOut, uint32_t id)
5027 {
5028     int32_t hr = CM_SUCCESS;
5029 
5030     CSync* kernelLock = m_device->GetProgramKernelLock();
5031     CLock locker(*kernelLock);
5032 
5033     CmDynamicArray * kernelArray = m_device->GetKernelArray();
5034 
5035     uint32_t freeSlotinKernelArray = kernelArray->GetFirstFreeIndex();
5036 
5037     hr = Create(m_device, m_program, (char*)GetName(), freeSlotinKernelArray, id, kernelOut, m_options);
5038 
5039     if (hr == CM_SUCCESS)
5040     {
5041         kernelOut->SetAsClonedKernel(m_id >> 32);
5042         kernelArray->SetElement(freeSlotinKernelArray, kernelOut);
5043         uint32_t *kernelCount = m_device->GetKernelCount();
5044         *kernelCount = *kernelCount + 1;
5045 
5046         SetHasClones();
5047     }
5048 
5049     return hr;
5050 }
5051 
5052 //*-----------------------------------------------------------------------------
5053 //| Purpose:    Set Kernel's index in one task
5054 //| Returns:    Result of the operation.
5055 //*-----------------------------------------------------------------------------
SetIndexInTask(uint32_t index)5056 int32_t CmKernelRT::SetIndexInTask(uint32_t index)
5057 {
5058     m_indexInTask = index;
5059     return CM_SUCCESS;
5060 }
5061 
5062 //*-----------------------------------------------------------------------------
5063 //| Purpose:    Get Kernel's index in one task
5064 //| Returns:    The index previously stored by SetIndexInTask().
5065 //*-----------------------------------------------------------------------------
GetIndexInTask(void)5066 uint32_t CmKernelRT::GetIndexInTask(void)
5067 {
5068     return m_indexInTask;
5069 }
5070 
5071 //*-----------------------------------------------------------------------------
5072 //| Purpose:    Set Associated Flag
5073 //| Returns:    Result of the operation.
5074 //*-----------------------------------------------------------------------------
SetAssociatedToTSFlag(bool b)5075 int32_t CmKernelRT::SetAssociatedToTSFlag(bool b)
5076 {
5077     m_threadSpaceAssociated = b;
5078     return CM_SUCCESS;
5079 }
5080 
5081 //*-----------------------------------------------------------------------------
5082 //| Purpose: Set threadspace for kernel
5083 //| Returns: Result of the operation.
5084 //| Note: It's exclusive with AssociateThreadGroupSpace()
5085 //*-----------------------------------------------------------------------------
AssociateThreadSpace(CmThreadSpace * & threadSpace)5086 CM_RT_API int32_t CmKernelRT::AssociateThreadSpace(CmThreadSpace *&threadSpace)
5087 {
5088     if( threadSpace == nullptr )
5089     {
5090         CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5091         return CM_INVALID_ARG_VALUE;
5092     }
5093 
5094     PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
5095     if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
5096     {
5097         CmThreadSpaceRT *threadSpaceRTConst = static_cast<CmThreadSpaceRT *>(threadSpace);
5098         if (threadSpaceRTConst == nullptr)
5099         {
5100             CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5101             return CM_INVALID_ARG_VALUE;
5102         }
5103         CmThreadGroupSpace *threadGroupSpace = threadSpaceRTConst->GetThreadGroupSpace();
5104         return AssociateThreadGroupSpace(threadGroupSpace);
5105     }
5106     else
5107     {
5108         if (m_threadGroupSpace != nullptr)
5109         {
5110             CM_ASSERTMESSAGE("Error: It's exclusive with AssociateThreadGroupSpace().");
5111             return CM_INVALID_KERNEL_THREADSPACE;
5112         }
5113     }
5114 
5115     bool threadSpaceChanged = false;
5116     if( m_threadSpace )
5117     {
5118         if( m_threadSpace != static_cast<CmThreadSpaceRT *>(threadSpace) )
5119         {
5120             threadSpaceChanged = true;
5121         }
5122     }
5123 
5124     m_threadSpace = static_cast<CmThreadSpaceRT *>(threadSpace);
5125 
5126     uint32_t threadSpaceWidth = 0;
5127     uint32_t threadSpaceHeight = 0;
5128     m_threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
5129     uint32_t threadCount = threadSpaceWidth * threadSpaceHeight;
5130     if (m_threadCount)
5131     {
5132         // Setting threadCount twice with different values will cause reset of kernels
5133         if (m_threadCount != threadCount)
5134         {
5135             m_threadCount = threadCount;
5136             m_dirty |= CM_KERNEL_DATA_THREAD_COUNT_DIRTY;
5137         }
5138     }
5139     else // first time
5140     {
5141         m_threadCount = threadCount;
5142     }
5143 
5144     if( threadSpaceChanged )
5145     {
5146         m_threadSpace->SetDirtyStatus( CM_THREAD_SPACE_DATA_DIRTY);
5147     }
5148 
5149     return CM_SUCCESS;
5150 }
5151 
5152 //*-----------------------------------------------------------------------------
5153 //| Purpose: Set thread group space for kernel
5154 //| Returns: Result of the operation.
5155 //| Note: It's exclusive with AssociateThreadSpace()
5156 //*-----------------------------------------------------------------------------
AssociateThreadGroupSpace(CmThreadGroupSpace * & threadGroupSpace)5157 CM_RT_API int32_t CmKernelRT::AssociateThreadGroupSpace(CmThreadGroupSpace *&threadGroupSpace)
5158 {
5159     if( threadGroupSpace == nullptr )
5160     {
5161         CM_ASSERTMESSAGE("Error: Invalid null pointer.");
5162         return CM_INVALID_ARG_VALUE;
5163     }
5164 
5165     if (m_threadSpace != nullptr)
5166     {
5167         CM_ASSERTMESSAGE("Error: It's exclusive with AssociateThreadSpace().");
5168         return CM_INVALID_KERNEL_THREADGROUPSPACE;
5169     }
5170 
5171     m_threadGroupSpace = threadGroupSpace;
5172 
5173     return CM_SUCCESS;
5174 }
5175 
5176 //*-----------------------------------------------------------------------------
5177 //| Purpose: Clear threadspace for kernel
5178 //| Returns: Result of the operation.
5179 //*-----------------------------------------------------------------------------
DeAssociateThreadSpace(CmThreadSpace * & threadSpace)5180 CM_RT_API int32_t CmKernelRT::DeAssociateThreadSpace(CmThreadSpace * &threadSpace)
5181 {
5182     if (threadSpace == nullptr)
5183     {
5184         CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5185         return CM_NULL_POINTER;
5186     }
5187 
5188     PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
5189     if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
5190     {
5191         CmThreadSpaceRT *threadSpaceRTConst = static_cast<CmThreadSpaceRT *>(threadSpace);
5192         if (threadSpaceRTConst == nullptr)
5193         {
5194             CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5195             return CM_INVALID_ARG_VALUE;
5196         }
5197 
5198         CmThreadGroupSpace *threadGroupSpace = threadSpaceRTConst->GetThreadGroupSpace();
5199         if (m_threadGroupSpace != threadGroupSpace)
5200         {
5201             CM_ASSERTMESSAGE("Error: Invalid thread group space handle.");
5202             return CM_INVALID_ARG_VALUE;
5203         }
5204         m_threadGroupSpace = nullptr;
5205     }
5206     else
5207     {
5208         if (m_threadSpace != static_cast<CmThreadSpaceRT *>(threadSpace))
5209         {
5210             CM_ASSERTMESSAGE("Error: Invalid thread space handle.");
5211             return CM_INVALID_ARG_VALUE;
5212         }
5213         m_threadSpace = nullptr;
5214     }
5215 
5216     return CM_SUCCESS;
5217 }
5218 //*--------------------------------------------------------------------------------------------
5219 //| Purpose: query spill memory size, the function can only take effect when jitter is enabled
5220 //| Return: Result of the operation.
5221 //*---------------------------------------------------------------------------------------------
5222 
QuerySpillSize(uint32_t & spillMemorySize)5223 CM_RT_API int32_t CmKernelRT::QuerySpillSize(uint32_t &spillMemorySize)
5224 {
5225     CM_KERNEL_INFO  *kernelInfo = nullptr;
5226     int32_t kernelStartIndex = m_program->GetKernelStartIndex();
5227 
5228     int32_t hr = m_program->GetKernelInfo(m_kernelIndexInProgram, kernelInfo);
5229     if (hr != CM_SUCCESS || kernelInfo == nullptr)
5230         return hr;
5231 
5232     if (m_program->IsJitterEnabled()) {
5233         if (kernelInfo->jitInfo != nullptr) {
5234             spillMemorySize = (kernelInfo->jitInfo)->spillMemUsed;
5235             return hr;
5236         }
5237         else
5238             return CM_FAILURE;
5239     }
5240 
5241     return CM_FAILURE;
5242 }
5243 
5244 //*-----------------------------------------------------------------------------
5245 //| Purpose: Clear threadgroupspace for kernel
5246 //| Returns: Result of the operation.
5247 //*-----------------------------------------------------------------------------
DeAssociateThreadGroupSpace(CmThreadGroupSpace * & threadGroupSpace)5248 int32_t CmKernelRT::DeAssociateThreadGroupSpace(CmThreadGroupSpace * &threadGroupSpace)
5249 {
5250     if (threadGroupSpace == nullptr)
5251     {
5252         CM_ASSERTMESSAGE("Error: Invalid null pointer.");
5253         return CM_NULL_POINTER;
5254     }
5255     if (m_threadGroupSpace != threadGroupSpace)
5256     {
5257         CM_ASSERTMESSAGE("Error: Invalid thread group space handle.");
5258         return CM_INVALID_ARG_VALUE;
5259     }
5260     m_threadGroupSpace = nullptr;
5261     m_dirty            = CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY;
5262 
5263     return CM_SUCCESS;
5264 }
5265 
5266 //*-----------------------------------------------------------------------------
5267 //| Purpose:    Indicate whether thread arg existed.
5268 //| Returns:    True if a per-thread argument exists, false otherwise.
5269 //*-----------------------------------------------------------------------------
IsThreadArgExisted()5270 bool CmKernelRT::IsThreadArgExisted()
5271 {
5272     return m_perThreadArgExists;
5273 }
5274 
5275 //*-----------------------------------------------------------------------------
5276 //| Purpose:    Get the size of SharedLocalMemory
5277 //| Returns:    The SLM size recorded in the kernel info.
5278 //*-----------------------------------------------------------------------------
GetSLMSize()5279 uint32_t CmKernelRT::GetSLMSize()
5280 {
5281     return (uint32_t)m_kernelInfo->kernelSLMSize;
5282 }
5283 
5284 //*-----------------------------------------------------------------------------
5285 //| Purpose:    Get the spill size of the kernel from JIT
5286 //| Returns:    Spill memory size reported by the JIT, or 0 if unavailable.
5287 //*-----------------------------------------------------------------------------
GetSpillMemUsed()5288 uint32_t CmKernelRT::GetSpillMemUsed()
5289 {
5290     uint32_t spillSize;
5291 
5292     if (m_program->IsJitterEnabled() && m_kernelInfo->jitInfo != nullptr)
5293     {
5294         spillSize = (m_kernelInfo->jitInfo)->spillMemUsed;
5295     }
5296     else
5297     {
5298         // kernel uses "--nojitter" option, don't allocate scratch space
5299         spillSize = 0;
5300     }
5301 
5302     return spillSize;
5303 }
5304 
SearchAvailableIndirectSurfInfoTableEntry(uint16_t kind,uint32_t surfaceIndex,uint32_t bti)5305 int32_t CmKernelRT::SearchAvailableIndirectSurfInfoTableEntry(uint16_t kind, uint32_t surfaceIndex, uint32_t bti)
5306 {
5307     uint16_t i = 0;
5308     for ( i = 0; i < CM_MAX_STATIC_SURFACE_STATES_PER_BT; i++ )
5309     {
5310         if ( ( ( m_IndirectSurfaceInfoArray[ i ].surfaceIndex == surfaceIndex ) && ( m_IndirectSurfaceInfoArray[ i ].kind == kind ) && ( m_IndirectSurfaceInfoArray[ i ].bindingTableIndex == bti ) ) ||
5311             ( ( m_IndirectSurfaceInfoArray[ i ].surfaceIndex == 0 ) && ( m_IndirectSurfaceInfoArray[ i ].kind == 0 ) ) )
5312         {
5313             return i;
5314         }
5315     }
5316     // should never reach this
5317     CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5318     return CM_FAILURE;
5319 }
5320 
5321 //-----------------------------------------------------------------------------------------------------------------
5322 //! Set surface binding table index count for each indirect surface
5323 //! INPUT:
5324 //!     1) Surface format
5325 //!     2) Surface type.
5326 //! OUTPUT:
5327 //!     binding table index count
5328 //-----------------------------------------------------------------------------------------------------------------
SetSurfBTINumForIndirectData(CM_SURFACE_FORMAT format,CM_ENUM_CLASS_TYPE surfaceType)5329 int32_t CmKernelRT::SetSurfBTINumForIndirectData(CM_SURFACE_FORMAT format, CM_ENUM_CLASS_TYPE surfaceType)
5330 {
5331     if (surfaceType == CM_ENUM_CLASS_TYPE_CMBUFFER_RT)
5332     {
5333         return 1;
5334     }
5335     else
5336     {
5337         if ((format == CM_SURFACE_FORMAT_NV12) ||
5338             (format == CM_SURFACE_FORMAT_P010) ||
5339             (format == CM_SURFACE_FORMAT_P208) ||
5340             (format == CM_SURFACE_FORMAT_P016))
5341         {
5342             return 2;
5343         }
5344         else if (format == CM_SURFACE_FORMAT_422H ||
5345             format == CM_SURFACE_FORMAT_411P ||
5346             format == CM_SURFACE_FORMAT_IMC3 ||
5347             format == CM_SURFACE_FORMAT_422V ||
5348             format == CM_SURFACE_FORMAT_444P)
5349         {   // 3 planes surface
5350             return 3;
5351         }
5352         else
5353         {
5354             return 1;
5355         }
5356     }
5357     // should never reach this
5358     CM_ASSERTMESSAGE("Error: Set surface binding table index count failure.");
5359     return 0;
5360 }
5361 
5362 //-----------------------------------------------------------------------------------------------------------------
5363 //! Set surface binding table index by user.
5364 //! If the application wants to assign a specific binding table index to a surface, it should call this function.
5365 //! The assigned binding table index must be a valid value for a general surface (i.e. >=1 and <=242);
5366 //! otherwise, this call will return failure.
5367 //! INPUT:
5368 //!     1) Surface whose binding table index need be set.
5369 //!     2) Assigned binding table index.
5370 //! OUTPUT:
5371 //!     CM_SUCCESS
5372 //!     CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX if the surface index is not a valid binding table index (valid: 1~242)
5373 //!     CM_FAILURE otherwise
5374 //-----------------------------------------------------------------------------------------------------------------
SetSurfaceBTI(SurfaceIndex * surface,uint32_t btIndex)5375 CM_RT_API int32_t CmKernelRT::SetSurfaceBTI(SurfaceIndex* surface, uint32_t btIndex)
5376 {
5377 
5378     uint32_t                    width, height, bytesPerPixel;
5379     CM_SURFACE_FORMAT           format = CM_SURFACE_FORMAT_INVALID;
5380     //Sanity check
5381     if (surface == nullptr)
5382     {
5383         CM_ASSERTMESSAGE("Error: Pointer to surface is null.");
5384         return CM_NULL_POINTER;
5385     }
5386     if (!m_surfaceMgr->IsValidSurfaceIndex(btIndex))
5387     {
5388         CM_ASSERTMESSAGE("Error: Invalid binding table index.");
5389         return CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX;
5390     }
5391 
5392     //Sanity check: if the BTI has been used once enqueue
5393     uint32_t i = 0;
5394     for (i = 0; i < m_usKernelPayloadSurfaceCount; i++)
5395     {
5396         if (m_IndirectSurfaceInfoArray[i].bindingTableIndex == (uint16_t)btIndex)
5397         {
5398             CM_ASSERTMESSAGE("Error: Binding table index has been used once enqueue.");
5399             return CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX;
5400         }
5401     }
5402 
5403     uint32_t index = surface->get_data();
5404     uint32_t handle = 0;
5405 
5406     CmSurface* surfaceRT = nullptr;
5407     m_surfaceMgr->GetSurface( index, surfaceRT );
5408     if(surfaceRT == nullptr)
5409     {
5410         CM_ASSERTMESSAGE("Error: Invalid surface.");
5411         return CM_NULL_POINTER;
5412     }
5413 
5414     CmSurface2DRT* surf2D = nullptr;
5415     uint32_t indirectSurfInfoEntry = 0;
5416     if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACE2D )
5417     {
5418         surf2D = static_cast< CmSurface2DRT* >( surfaceRT );
5419         surf2D->GetHandle( handle );
5420         indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_2D, handle, btIndex);
5421         if (indirectSurfInfoEntry == CM_FAILURE)
5422         {
5423             CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5424             return CM_FAILURE;
5425         }
5426         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_2D;
5427         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5428         surf2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5429     }
5430     else
5431     {
5432         CmBuffer_RT* cmBuffer = nullptr;
5433         if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMBUFFER_RT )
5434         {
5435             cmBuffer = static_cast< CmBuffer_RT* >( surfaceRT );
5436             cmBuffer->GetHandle( handle );
5437             indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_1D, handle, btIndex);
5438             if (indirectSurfInfoEntry == CM_FAILURE)
5439             {
5440                 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5441                 return CM_FAILURE;
5442             }
5443             m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_1D;
5444             m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5445         }
5446         else
5447         {
5448             CmSurface2DUPRT* surf2DUP = nullptr;
5449             if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACE2DUP )
5450             {
5451                 surf2DUP = static_cast< CmSurface2DUPRT* >( surfaceRT );
5452                 surf2DUP->GetHandle( handle );
5453                 indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_2D_UP, handle, btIndex);
5454                 if (indirectSurfInfoEntry == CM_FAILURE)
5455                 {
5456                     CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5457                     return CM_FAILURE;
5458                 }
5459                 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_2D_UP;
5460                 m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5461                 surf2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5462             }
5463             else
5464             {
5465                 CmSurfaceSampler* surfSampler = nullptr;
5466                 if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER )
5467                 {
5468                     surfSampler = static_cast< CmSurfaceSampler* >(surfaceRT);
5469 
5470                     //Get  actually SurfaceIndex ID for 2D
5471                     uint16_t surfIndexForCurrent = 0;
5472                     surfSampler->GetCmIndexCurrent(surfIndexForCurrent);
5473                     CmSurface* surfSampRT= nullptr;
5474                     m_surfaceMgr->GetSurface(surfIndexForCurrent, surfSampRT);
5475                     if(surfSampRT == nullptr)
5476                     {
5477                         CM_ASSERTMESSAGE("Error: Invalid surface.");
5478                         return CM_NULL_POINTER;
5479                     }
5480 
5481                     SAMPLER_SURFACE_TYPE surfaceType;
5482                     surfSampler->GetSurfaceType(surfaceType);
5483                     surfSampler->GetHandle( handle );
5484                     if ( surfaceType == SAMPLER_SURFACE_TYPE_2D )
5485                     {
5486                         CmSurface2DRT* surfSamp2D = nullptr;
5487                         surfSamp2D = static_cast<CmSurface2DRT*>(surfSampRT);
5488                         surfSamp2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5489 
5490                         indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER, handle, btIndex);
5491                         if (indirectSurfInfoEntry == CM_FAILURE)
5492                         {
5493                             CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5494                             return CM_FAILURE;
5495                         }
5496                         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER;
5497                     }
5498                     else if ( surfaceType == SAMPLER_SURFACE_TYPE_2DUP )
5499                     {
5500                         CmSurface2DUPRT* surfSamp2DUP = nullptr;
5501                         surfSamp2DUP = static_cast<CmSurface2DUPRT*>(surfSampRT);
5502                         surfSamp2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5503 
5504                         indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE2DUP_SAMPLER, handle, btIndex);
5505                         if (indirectSurfInfoEntry == CM_FAILURE)
5506                         {
5507                             CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5508                             return CM_FAILURE;
5509                         }
5510                         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE2DUP_SAMPLER;
5511                     }
5512                     else if ( surfaceType == SAMPLER_SURFACE_TYPE_3D )
5513                     {
5514                         indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_3D, handle, btIndex);
5515                         if (indirectSurfInfoEntry == CM_FAILURE)
5516                         {
5517                             CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5518                             return CM_FAILURE;
5519                         }
5520                         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_3D;
5521                     }
5522                     m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5523                 }
5524                 else
5525                 {
5526                     CmSurfaceSampler8x8* surfSampler8x8 = nullptr;
5527                     if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8 )
5528                     {
5529                         surfSampler8x8 = static_cast< CmSurfaceSampler8x8* >( surfaceRT );
5530                         surfSampler8x8->GetIndexCurrent( handle );
5531 
5532                         //Get  actually SurfaceIndex ID for 2D
5533                         uint16_t surfIndexForCurrent = 0;
5534                         surfSampler8x8->GetCmIndex(surfIndexForCurrent);
5535                         CmSurface* surfSamp8x8RT = nullptr;
5536                         m_surfaceMgr->GetSurface(surfIndexForCurrent, surfSamp8x8RT);
5537                         if(surfSamp8x8RT == nullptr)
5538                         {
5539                             CM_ASSERTMESSAGE("Error: Invalid surface.");
5540                             return CM_NULL_POINTER;
5541                         }
5542 
5543                         CmSurface2DRT* surfSamp8x82D = nullptr;
5544                         surfSamp8x82D = static_cast<CmSurface2DRT*>(surfSamp8x8RT);
5545                         surfSamp8x82D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5546 
5547                         if ( surfSampler8x8->GetSampler8x8SurfaceType() == CM_AVS_SURFACE )
5548                         {
5549                             indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER8X8_AVS, handle, btIndex);
5550                             if (indirectSurfInfoEntry == CM_FAILURE)
5551                             {
5552                                 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5553                                 return CM_FAILURE;
5554                             }
5555                             m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
5556                         }
5557                         else if ( surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE )
5558                         {
5559                             indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER8X8_VA, handle, btIndex);
5560                             if (indirectSurfInfoEntry == CM_FAILURE)
5561                             {
5562                                 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5563                                 return CM_FAILURE;
5564                             }
5565                             m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
5566                         }
5567                         m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
5568                     }
5569                     else
5570                     {
5571                             return CM_FAILURE;
5572                     }
5573                 }
5574             }
5575         }
5576     }
5577 
5578     m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].bindingTableIndex = (uint16_t)btIndex;
5579     if (SetSurfBTINumForIndirectData(format, surfaceRT->Type())== 0)
5580     {
5581         CM_ASSERTMESSAGE("Error: Set surface binding table index count failure.");
5582         return CM_FAILURE;
5583     }
5584     m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].numBTIPerSurf = (uint16_t)SetSurfBTINumForIndirectData(format, surfaceRT->Type());
5585 
5586     //Copy it to surface index array
5587 
5588     m_pKernelPayloadSurfaceArray[indirectSurfInfoEntry] = surface;
5589 
5590 
5591     // count is actally one larger than the actual index
5592     m_usKernelPayloadSurfaceCount = indirectSurfInfoEntry + 1;
5593     m_dirty |= (CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY | CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY);
5594     return CM_SUCCESS;
5595 }
5596 
GetKernelIndex()5597 uint32_t CmKernelRT::GetKernelIndex()
5598 {
5599     return m_kernelIndex;
5600 }
GetKernelGenxBinarySize(void)5601 uint32_t CmKernelRT::GetKernelGenxBinarySize(void)
5602 {
5603     if(m_kernelInfo == nullptr)
5604     {
5605         CM_ASSERTMESSAGE("Error: Invalid kernel genx binary size.");
5606         return 0;
5607     }
5608     else
5609     {
5610         return m_kernelInfo->genxBinarySize;
5611     }
5612 }
5613 
5614 //-----------------------------------------------------------------------------------------------------------------
5615 //! Map Surface type to Kernel arg Kind.
5616 //! INPUT:  Surface type    :CM_ENUM_CLASS_TYPE
5617 //! OUTPUT: Kernel arg Kind :CM_ARG_KIND
5618 //-----------------------------------------------------------------------------------------------------------------
SurfTypeToArgKind(CM_ENUM_CLASS_TYPE surfType)5619 CM_ARG_KIND CmKernelRT::SurfTypeToArgKind(CM_ENUM_CLASS_TYPE surfType)
5620 {
5621     switch(surfType)
5622     {
5623         case CM_ENUM_CLASS_TYPE_CMBUFFER_RT          :return ARG_KIND_SURFACE_1D;
5624         case CM_ENUM_CLASS_TYPE_CMSURFACE2D          :return ARG_KIND_SURFACE_2D;
5625         case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP        :return ARG_KIND_SURFACE_2D_UP;
5626         case CM_ENUM_CLASS_TYPE_CMSURFACE3D          :return ARG_KIND_SURFACE_3D;
5627         case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER     :return ARG_KIND_SURFACE_SAMPLER;
5628         case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8  :return ARG_KIND_SURFACE_SAMPLER8X8_AVS;
5629         case CM_ENUM_CLASS_TYPE_CMSURFACEVME         :return ARG_KIND_SURFACE_VME;
5630         case CM_ENUM_CLASS_TYPE_CMSAMPLER_RT         :return ARG_KIND_SAMPLER;
5631         case CM_ENUM_CLASS_TYPE_CMSAMPLER8X8STATE_RT :return ARG_KIND_SAMPLER;
5632         case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER      :return ARG_KIND_STATE_BUFFER;
5633 
5634         default:
5635             CM_ASSERTMESSAGE("Error: Invalid surface type.");
5636             break;
5637    }
5638    return ARG_KIND_GENERAL;
5639 }
5640 
CalculateKernelSurfacesNum(uint32_t & kernelSurfaceNum,uint32_t & neededBTEntryNum)5641 int32_t CmKernelRT::CalculateKernelSurfacesNum(uint32_t& kernelSurfaceNum, uint32_t& neededBTEntryNum)
5642 {
5643     uint32_t            surfaceArraySize = 0;
5644     CmSurface*          surf = nullptr;
5645     CmSurface2DRT*        surf2D = nullptr;
5646     CmSurface2DUPRT*      surf2DUP = nullptr;
5647     uint32_t              width, height, bytesPerPixel;
5648     CM_SURFACE_FORMAT     format;
5649     uint32_t              maxBTIndex = 0;
5650 
5651     kernelSurfaceNum = 0;
5652     neededBTEntryNum = 0;
5653 
5654     surfaceArraySize = m_surfaceMgr->GetSurfacePoolSize();
5655 
5656     //Calculate surface number and needed binding table entries
5657     for (uint32_t surfIndex = 0; surfIndex <= m_maxSurfaceIndexAllocated; surfIndex ++)
5658     {
5659         if (m_surfaceArray[surfIndex%surfaceArraySize])
5660         {
5661             surf = nullptr;
5662             m_surfaceMgr->GetSurface(surfIndex, surf);
5663             if (surf)
5664             {
5665                 switch(surf->Type())
5666                 {
5667                     case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
5668                     case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
5669                         kernelSurfaceNum ++;
5670                         neededBTEntryNum ++;
5671                         break;
5672 
5673                     case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
5674                     case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
5675                     case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
5676                         //virtual surface, no need increase count
5677                         break;
5678 
5679                     case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
5680                         kernelSurfaceNum++;
5681                         surf2D = static_cast<CmSurface2DRT*>(surf);
5682                         format = CM_SURFACE_FORMAT_INVALID;
5683                         surf2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5684                         if ((format == CM_SURFACE_FORMAT_NV12) ||
5685                             (format == CM_SURFACE_FORMAT_P010) ||
5686                             (format == CM_SURFACE_FORMAT_P208) ||
5687                             (format == CM_SURFACE_FORMAT_P016))
5688                         {
5689                             neededBTEntryNum += 2;
5690                         }
5691                         else if (format == CM_SURFACE_FORMAT_422H ||
5692                             format == CM_SURFACE_FORMAT_411P ||
5693                             format == CM_SURFACE_FORMAT_IMC3 ||
5694                             format == CM_SURFACE_FORMAT_422V ||
5695                             format == CM_SURFACE_FORMAT_444P)
5696                         {   // 3 planes surface
5697                             neededBTEntryNum += 3;
5698                         }
5699                         else
5700                         {
5701                             neededBTEntryNum += 1;
5702                         }
5703                         break;
5704 
5705                     case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
5706                         kernelSurfaceNum++;
5707                         surf2DUP = static_cast<CmSurface2DUPRT*>(surf);
5708                         format = CM_SURFACE_FORMAT_INVALID;
5709                         surf2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5710                         if ((format == CM_SURFACE_FORMAT_NV12) ||
5711                             (format == CM_SURFACE_FORMAT_P010) ||
5712                             (format == CM_SURFACE_FORMAT_P208) ||
5713                             (format == CM_SURFACE_FORMAT_P016))
5714                         {
5715                             neededBTEntryNum += 2;
5716                         }
5717                         else if (format == CM_SURFACE_FORMAT_422H ||
5718                             format == CM_SURFACE_FORMAT_411P ||
5719                             format == CM_SURFACE_FORMAT_IMC3 ||
5720                             format == CM_SURFACE_FORMAT_422V ||
5721                             format == CM_SURFACE_FORMAT_444P)
5722                         {   // 3 planes surface
5723                             neededBTEntryNum += 3;
5724                         }
5725                         else
5726                         {
5727                             neededBTEntryNum += 1;
5728                         }
5729                         break;
5730 
5731                     default:
5732                         break;
5733                 }
5734             }
5735         }
5736     }
5737 
5738     if ((maxBTIndex + 1) > neededBTEntryNum)
5739     {
5740         neededBTEntryNum = maxBTIndex + 1;
5741     }
5742 
5743     //Workaround: the calculation may be inaccurate if VME surfaces exist
5744     neededBTEntryNum += m_vmeSurfaceCount;
5745 
5746     return CM_SUCCESS;
5747 }
5748 
5749 //*-----------------------------------------------------------------------------
5750 //| Purpose:    Get aligned curbe size for different platforms
5751 //| Returns:    Result of operation.
5752 //*-----------------------------------------------------------------------------
GetAlignedCurbeSize(uint32_t value)5753 uint32_t CmKernelRT::GetAlignedCurbeSize(uint32_t value)
5754 {
5755     uint32_t curbeAlignedSize    = 0;
5756 
5757     curbeAlignedSize = MOS_ALIGN_CEIL(value, RENDERHAL_CURBE_BLOCK_ALIGN);
5758     return curbeAlignedSize;
5759 }
5760 
5761 #if CM_LOG_ON
Log()5762 std::string CmKernelRT::Log()
5763 {
5764 
5765     std::ostringstream  oss;
5766 
5767     oss << " Kernel Name:"         << m_kernelInfo->kernelName << std::endl
5768         << " Kernel Binary Size:"  << m_kernelInfo->jitBinarySize
5769         << " Index In Task:"       << m_indexInTask
5770         << " Thread Count:"        << m_threadCount
5771         << " Curbe Size:"          << m_sizeInCurbe
5772         << " Kernel arg Count:"    << m_argCount
5773         << std::endl;
5774 
5775      // Per Kernel Thread Space Log
5776     if(m_threadSpace)
5777     {
5778         oss << m_threadSpace->Log();
5779     }
5780 
5781     // Per Kernel Thread Group Space Log
5782     if(m_threadGroupSpace)
5783     {
5784         oss << m_threadGroupSpace->Log();
5785     }
5786 
5787     // Arguments Log
5788     for (uint32_t argIndex= 0; argIndex< m_argCount; argIndex++ )
5789     {
5790         if (m_args[argIndex].value) // filter out the implicit arguments
5791         {
5792             ArgLog(oss, argIndex, m_args[argIndex]);
5793         }
5794     }
5795 
5796     return oss.str();
5797 }
5798 
ArgLog(std::ostringstream & oss,uint32_t index,CM_ARG arg)5799 void CmKernelRT::ArgLog(std::ostringstream &oss, uint32_t index, CM_ARG arg)
5800 {
5801 
5802     oss << "[" << index << "] th Argument"
5803         << " Type :" << arg.unitKind
5804         << " Count:" << arg.unitCount
5805         << " Size:" << arg.unitSize
5806         << " Surface Kind:" << (int)arg.surfaceKind
5807         << " OffsetInPayload:" << arg.unitOffsetInPayload
5808         << " OffsetInPayloadOrig:" << arg.unitOffsetInPayloadOrig << "";
5809 
5810     CmLogger::LogDataArrayHex( oss, arg.value, arg.unitSize * arg.unitCount);
5811 
5812     if (CHECK_SURFACE_TYPE(arg.unitKind,
5813                            ARG_KIND_SURFACE_1D,
5814                            ARG_KIND_SURFACE_2D,
5815                            ARG_KIND_SURFACE_2D_UP,
5816                            ARG_KIND_SURFACE_VME,
5817                            ARG_KIND_SURFACE_SAMPLER,
5818                            ARG_KIND_SURFACE_3D,
5819                            ARG_KIND_SURFACE_SAMPLER8X8_AVS,
5820                            ARG_KIND_SURFACE_SAMPLER8X8_VA,
5821                            ARG_KIND_SURFACE2DUP_SAMPLER))
5822     {
5823         uint16_t numSurfaces = arg.unitSize / sizeof(uint32_t);
5824         if (arg.unitKind == ARG_KIND_SURFACE_VME)
5825         {
5826             numSurfaces = (arg.unitSize - sizeof(CM_HAL_VME_ARG_VALUE) * arg.unitVmeArraySize) / sizeof(uint32_t) + arg.unitVmeArraySize;
5827         }
5828         for (uint16_t i = 0; i < numSurfaces; i++)
5829         {
5830             uint32_t surfaceIndex = *(uint16_t *)(arg.surfIndex + i);
5831 
5832             if(surfaceIndex == CM_NULL_SURFACE)
5833                 continue;
5834 
5835             CmSurface *surf = nullptr;
5836             m_surfaceMgr->GetSurface(surfaceIndex, surf);
5837             if (surf == nullptr)
5838             {
5839                 continue;
5840             }
5841             surf->Log(oss);
5842         }
5843     }
5844 }
5845 
GetHalState()5846 CM_HAL_STATE* CmKernelRT::GetHalState() { return m_device->GetHalState(); }
5847 
5848 #endif  // #if CM_LOG_ON
5849 
SurfaceDump(uint32_t kernelNumber,int32_t taskId)5850 void CmKernelRT::SurfaceDump(uint32_t kernelNumber, int32_t taskId)
5851 {
5852 #if MDF_SURFACE_CONTENT_DUMP
5853     CM_ARG arg;
5854 
5855     for (uint32_t argIndex = 0; argIndex< m_argCount; argIndex++)
5856     {
5857         arg = m_args[argIndex];
5858         if (CHECK_SURFACE_TYPE(arg.unitKind,
5859             ARG_KIND_SURFACE_1D,
5860             ARG_KIND_SURFACE_2D,
5861             ARG_KIND_SURFACE_2D_UP,
5862             ARG_KIND_SURFACE_VME,
5863             ARG_KIND_SURFACE_SAMPLER,
5864             ARG_KIND_SURFACE_3D,
5865             ARG_KIND_SURFACE_SAMPLER8X8_AVS,
5866             ARG_KIND_SURFACE_SAMPLER8X8_VA,
5867             ARG_KIND_SURFACE2DUP_SAMPLER))
5868         {
5869             uint16_t numSurfaces = arg.unitSize / sizeof(uint32_t);
5870             if (arg.unitKind == ARG_KIND_SURFACE_VME)
5871             {
5872                 numSurfaces = (arg.unitSize - sizeof(CM_HAL_VME_ARG_VALUE) * arg.unitVmeArraySize) / sizeof(uint32_t) + arg.unitVmeArraySize;
5873             }
5874 
5875             for (uint16_t i = 0; i < numSurfaces; i++)
5876             {
5877                 uint32_t surfaceIndex = *(uint16_t *)(arg.surfIndex + i);
5878                 CmSurface *surf = nullptr;
5879                 m_surfaceMgr->GetSurface(surfaceIndex, surf);
5880                 if (surf == nullptr)
5881                 {
5882                     return;
5883                 }
5884                 surf->DumpContent(kernelNumber, m_kernelInfo->kernelName, taskId, argIndex, i);
5885             }
5886         }
5887     }
5888 #endif
5889 }
5890 
SetSamplerBTI(SamplerIndex * sampler,uint32_t nIndex)5891 CM_RT_API int32_t CmKernelRT::SetSamplerBTI(SamplerIndex* sampler, uint32_t nIndex)
5892 {
5893     if (!sampler)
5894     {
5895         return CM_NULL_POINTER;
5896     }
5897     if (CM_SAMPLER_MAX_BINDING_INDEX < nIndex)
5898     {
5899         return CM_KERNELPAYLOAD_SAMPLER_INVALID_BTINDEX;
5900     }
5901 
5902     uint32_t        samplerIndex   = sampler->get_data();
5903     PCM_HAL_STATE   cmHalState    = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
5904 
5905     uint32_t i = 0;
5906     for (i = 0; i < m_samplerBtiCount; i++)
5907     {
5908         if ((m_samplerBtiEntry[i].samplerIndex == samplerIndex) && (m_samplerBtiEntry[i].samplerBTI == nIndex))
5909         {
5910             break;
5911         }
5912         if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
5913         {
5914             if ((m_samplerBtiEntry[i].samplerIndex != samplerIndex) && (m_samplerBtiEntry[i].samplerBTI == nIndex))
5915             {
5916                 if (cmHalState->useNewSamplerHeap)
5917                 {
5918                     SamplerParam sampler1 = {};
5919                     SamplerParam sampler2 = {};
5920                     cmHalState->cmHalInterface->GetSamplerParamInfoForSamplerType(&cmHalState->samplerTable[m_samplerBtiEntry[i].samplerIndex], sampler1);
5921                     cmHalState->cmHalInterface->GetSamplerParamInfoForSamplerType(&cmHalState->samplerTable[samplerIndex], sampler2);
5922 
5923                     if (sampler1.elementType== sampler2.elementType)
5924                     {
5925                         // return failure only if the two samplers have the same type, because different type samplers are able to set to the same BTI
5926                         return CM_FAILURE;
5927                     }
5928                 }
5929                 else
5930                 {
5931                     return CM_FAILURE;
5932                 }
5933             }
5934 
5935             CmSampler8x8State_RT *sampler8x8 = nullptr;
5936             CmSampler8x8State_RT *tmpSampler8x8 = nullptr;
5937             m_device->GetSampler8x8(samplerIndex, sampler8x8);
5938             m_device->GetSampler8x8(m_samplerBtiEntry[i].samplerIndex, tmpSampler8x8);
5939 
5940             if (sampler8x8 && tmpSampler8x8 && (sampler8x8->GetStateType() == CM_SAMPLER8X8_AVS)
5941                 && (tmpSampler8x8->GetStateType() == CM_SAMPLER8X8_AVS) &&
5942                 cmHalState->cmHalInterface->IsAdjacentSamplerIndexRequiredbyHw())
5943             {
5944                 if ((m_samplerBtiEntry[i].samplerIndex != samplerIndex) &&
5945                     ((m_samplerBtiEntry[i].samplerBTI == nIndex + 1) || (m_samplerBtiEntry[i].samplerBTI == nIndex - 1)))
5946                     return CM_FAILURE;
5947             }
5948         }
5949     }
5950 
5951     if (i >= CM_MAX_SAMPLER_TABLE_SIZE)
5952     {
5953         CM_ASSERTMESSAGE("Error: Exceed maximum sampler table size.");
5954         return CM_FAILURE;
5955     }
5956 
5957     if (i == m_samplerBtiCount)
5958     {
5959         m_samplerBtiEntry[i].samplerIndex = samplerIndex;
5960         m_samplerBtiEntry[i].samplerBTI = nIndex;
5961 
5962         m_samplerBtiCount = i + 1;
5963 
5964         m_dirty |= cMKERNELDATASAMPLERBTIDIRTY;
5965     }
5966     return CM_SUCCESS;
5967 }
5968 
GetBinary(std::vector<char> & binary)5969 CMRT_UMD_API int32_t CmKernelRT::GetBinary(std::vector<char>& binary)
5970 {
5971     binary.resize(m_binarySize);
5972 
5973     CmSafeMemCopy((void *)&binary[0], (void *)m_binary, m_binarySize);
5974 
5975     return CM_SUCCESS;
5976 }
5977 
ReplaceBinary(std::vector<char> & binary)5978 CMRT_UMD_API int32_t CmKernelRT::ReplaceBinary(std::vector<char>& binary)
5979 {
5980     uint32_t size = binary.size();
5981 
5982     if (size == 0)
5983     {
5984         return CM_INVALID_ARG_VALUE;
5985     }
5986 
5987     if(m_binaryOrig == nullptr)
5988     {
5989         //Store the orignal binary once.
5990         m_binaryOrig = m_binary;
5991         m_binarySizeOrig = m_binarySize;
5992     }
5993 
5994     m_binary = MOS_NewArray(char, size);
5995     CmSafeMemCopy((void *)m_binary, (void *)&binary[0], size);
5996 
5997     m_binarySize = size;
5998 
5999     return CM_SUCCESS;
6000 }
6001 
ResetBinary()6002 CMRT_UMD_API int32_t CmKernelRT::ResetBinary()
6003 {
6004     if (m_binaryOrig == nullptr)
6005     {
6006         //ReplaceBinary is never called
6007         return CM_SUCCESS;
6008     }
6009     if(m_binary!= m_binaryOrig)
6010     {
6011         MosSafeDeleteArray(m_binary);
6012     }
6013     m_binary = m_binaryOrig;
6014     m_binarySize = m_binarySizeOrig;
6015 
6016     return CM_SUCCESS;
6017 }
6018 
UpdateSamplerHeap(CmKernelData * kernelData)6019 int CmKernelRT::UpdateSamplerHeap(CmKernelData *kernelData)
6020 {
6021     // Get sampler bti & offset
6022     PCM_HAL_KERNEL_PARAM cmKernel = nullptr;
6023     PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
6024     PCM_HAL_STATE state = cmData->cmHalState;
6025     std::list<SamplerParam>::iterator iter;
6026     unsigned int heapOffset = 0;
6027 
6028     if (state->useNewSamplerHeap == false)
6029     {
6030         return CM_SUCCESS;
6031     }
6032 
6033     heapOffset = 0;
6034     cmKernel = kernelData->GetHalCmKernelData();
6035     std::list<SamplerParam> *sampler_heap = cmKernel->samplerHeap;
6036 
6037     // First pass, inserts sampler with user-defined BTI to the list. Sorts by element order low to high, then by BTI order low to high.
6038     for (unsigned int samplerElementType = MHW_Sampler1Element; samplerElementType < MHW_SamplerTotalElements; samplerElementType++)
6039     {
6040         for (unsigned int n = 0; n < cmKernel->samplerBTIParam.samplerCount; ++n)
6041         {
6042             SamplerParam sampler = {};
6043             sampler.samplerTableIndex = cmKernel->samplerBTIParam.samplerInfo[n].samplerIndex;
6044 
6045             if (state->samplerTable[sampler.samplerTableIndex].ElementType == samplerElementType)
6046             {
6047                 sampler.bti = cmKernel->samplerBTIParam.samplerInfo[n].samplerBTI;
6048                 sampler.userDefinedBti = true;
6049                 state->cmHalInterface->GetSamplerParamInfoForSamplerType(&state->samplerTable[sampler.samplerTableIndex], sampler);
6050 
6051                 // Guarantees each user-defined BTI has a spacing between each other user-defined BTIs larger than the stepping
6052                 for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
6053                 {
6054                     if (iter->elementType == sampler.elementType)
6055                     {
6056                         unsigned int diff = (iter->bti > sampler.bti) ? (iter->bti - sampler.bti) : (sampler.bti - iter->bti);
6057                         if (diff < sampler.btiStepping)
6058                         {
6059                             CM_ASSERTMESSAGE("Sampler BTI setting error. Confliction with other Sampler BTI.\n");
6060                             return MOS_STATUS_INVALID_PARAMETER;
6061                         }
6062                     }
6063                 }
6064 
6065                 // Inserts by the order
6066                 for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
6067                 {
6068                     if (iter->elementType > sampler.elementType)
6069                     {
6070                         break;
6071                     }
6072                     else if ((iter->elementType == sampler.elementType) && (iter->bti > sampler.bti))
6073                     {
6074                         break;
6075                     }
6076                 }
6077                 sampler.heapOffset = sampler.bti * sampler.btiMultiplier;
6078                 sampler_heap->insert(iter, sampler);
6079             }
6080         }
6081     }
6082 
6083     // Second pass, loops over all kernel/thread args, find regular sampler and insert to sampler heap.
6084     // Follows the existing sorted order.
6085     for (unsigned int samplerElementType = MHW_Sampler1Element; samplerElementType < MHW_SamplerTotalElements; samplerElementType++)
6086     {
6087         for (unsigned int index = 0; index < cmKernel->numArgs; index++)
6088         {
6089             PCM_HAL_KERNEL_ARG_PARAM argParam = &cmKernel->argParams[index];
6090             if (argParam->isNull)
6091             {
6092                 continue;
6093             }
6094 
6095             for (unsigned int threadIndex = 0; threadIndex < argParam->unitCount; threadIndex++)
6096             {
6097                 if (argParam->kind == CM_ARGUMENT_SAMPLER)
6098                 {
6099                     unsigned char *arg = argParam->firstValue + (threadIndex * argParam->unitSize);
6100                     unsigned int samplerTableIndex = *((uint32_t *)arg);
6101 
6102                     SamplerParam sampler = {};
6103                     sampler.samplerTableIndex = samplerTableIndex;
6104                     state->cmHalInterface->GetSamplerParamInfoForSamplerType(&state->samplerTable[sampler.samplerTableIndex], sampler);
6105                     sampler.regularBti = true;
6106 
6107                     if (sampler.elementType != samplerElementType)
6108                     {
6109                         continue;
6110                     }
6111 
6112                     // if the sampler is already in the heap, skip
6113                     bool isDuplicate = false;
6114                     for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
6115                     {
6116                         if (iter->samplerTableIndex == sampler.samplerTableIndex)
6117                         {
6118                             isDuplicate = true;
6119                             iter->regularBti = true;
6120                             break;
6121                         }
6122                     }
6123                     if (isDuplicate == true)
6124                     {
6125                         continue;
6126                     }
6127 
6128                     // insert the new sampler to the heap
6129                     heapOffset = 0;
6130                     for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
6131                     {
6132                         if (iter->elementType == sampler.elementType)
6133                         {
6134                             // Needs to keep the inserted sampler's correctness, so do not insert before same element regular sampler
6135                             // Only insert before user-defined BTI
6136                             if (iter->userDefinedBti == true)
6137                             {
6138                                 unsigned int curOffset = iter->heapOffset;
6139                                 if (heapOffset > curOffset)
6140                                 {
6141                                     // Confliction, which means that sampler heap in smaller
6142                                     // element type has excced the position which is supposed
6143                                     // to put this user-defined BTI sampler.
6144                                     // User needs to set the BTI to a larger value.
6145                                     CM_ASSERTMESSAGE("Sampler BTI setting error. Confliction with other Sampler BTI.\n");
6146                                     return MOS_STATUS_INVALID_PARAMETER;
6147                                 }
6148                                 else
6149                                 {
6150                                     if (curOffset - heapOffset >= sampler.btiStepping * sampler.btiMultiplier)
6151                                     {
6152                                         break;
6153                                     }
6154                                     else
6155                                     {
6156                                         heapOffset = curOffset + iter->btiStepping * iter->btiMultiplier;
6157                                     }
6158                                 }
6159                             }
6160                             else
6161                             {
6162                                 heapOffset += iter->btiStepping * iter->btiMultiplier;
6163                             }
6164                         }
6165                         else if (iter->elementType > sampler.elementType)
6166                         {
6167                             break;
6168                         }
6169                         else
6170                         {
6171                             heapOffset = iter->heapOffset + iter->size;
6172                             std::list<SamplerParam>::iterator iter_next = std::next(iter, 1);
6173                             if ((iter_next != sampler_heap->end()) && (iter_next->elementType > iter->elementType))
6174                             {
6175                                 // Aligns heapOffset to next nearest multiple of sampler size if next sampler is a different element type
6176                                 heapOffset = (heapOffset + iter_next->btiStepping * iter_next->btiMultiplier - 1) / (iter_next->btiStepping * iter_next->btiMultiplier) * (iter_next->btiStepping * iter_next->btiMultiplier);
6177                             }
6178                         }
6179                     }
6180 
6181                     if(!sampler.btiMultiplier)
6182                     {
6183                         CM_ASSERTMESSAGE("Sampler BTI setting error. Multiplier cannot be zero!\n");
6184                         return MOS_STATUS_INVALID_PARAMETER;
6185                     }
6186 
6187                     if (iter == sampler_heap->end())
6188                     {
6189                         // Aligns heapOffset to next nearest multiple of sampler size if next sampler is a different element type
6190                         heapOffset = (heapOffset + sampler.btiStepping * sampler.btiMultiplier - 1) / (sampler.btiStepping * sampler.btiMultiplier) * (sampler.btiStepping * sampler.btiMultiplier);
6191                     }
6192                     sampler.heapOffset = heapOffset;
6193                     sampler.bti = sampler.heapOffset / sampler.btiMultiplier;
6194                     sampler_heap->insert(iter, sampler);
6195                 }
6196             }
6197         }
6198     }
6199 
6200     return CM_SUCCESS;
6201 }
6202 }
6203