1 /*
2 * Copyright (c) 2007-2017, Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22 //!
23 //! \file cm_kernel_rt.cpp
24 //! \brief Contains CmKernelRT definitions.
25 //!
26
27 #include "cm_kernel_rt.h"
28
29 #include "cm_program.h"
30 #include "cm_device_rt.h"
31 #include "cm_surface_manager.h"
32 #include "cm_surface_2d_up_rt.h"
33 #include "cm_surface_3d_rt.h"
34 #include "cm_buffer_rt.h"
35 #include "cm_mov_inst.h"
36 #include "cm_kernel_data.h"
37 #include "cm_thread_space_rt.h"
38 #include "cm_state_buffer.h"
39 #include "cm_surface_vme.h"
40 #include "cm_debug.h"
41 #include "cm_surface_sampler8x8.h"
42 #include "cm_surface_sampler.h"
43 #include "cm_group_space.h"
44 #include "cm_surface_2d_rt.h"
45 #include "cm_sampler8x8_state_rt.h"
46 #include "cm_visa.h"
47 #include "cm_extension_creator.h"
48 #include "cm_execution_adv.h"
49
50 #define GENERATE_GLOBAL_SURFACE_INDEX
51
52 #define READ_FIELD_FROM_BUF( dst, type ) \
53 dst = *((type *) &buf[bytePosition]); \
54 bytePosition += sizeof(type);
55
56 #define PER_ARG_SIZE_IN_DWORD 3
57 #define KERNEL_INFO_SIZE_IN_DWORD 4
58
59 #define DW_ALIGNMENT( byte_address ) \
60 if( byte_address % 4 ) \
61 byte_address = ( byte_address / 4 + 1 ) * 4;
62
63 #define GRF_ALIGNMENT( byte_address ) \
64 if( byte_address % 32 ) \
65 byte_address = ( byte_address / 32 + 1 ) * 32;
66
// Checks whether surface type nType matches any type in the trailing argument list
68 #define CHECK_SURFACE_TYPE( nType, ... ) ( _CheckSurfaceType( nType, __VA_ARGS__, -1 ) )
69
70 #define IsKernelArg(arg) ((arg).unitCount == 1)
71
72 // Warning : x must be uint32_t
73 #define SET_MEMORY_OBJECT_CONTROL(x, memCtl) \
74 x = ((uint16_t)(memCtl.mem_ctrl<< 8 | memCtl.mem_type << 4 | memCtl.age)) << 16 | (x);
75
76 #define ADD_INTO_VME_INDEX_ARRAY(value) \
77 vmeIndexArray[vmeIndexArrayPosition] = value ; \
78 vmeIndexArrayPosition ++;
79
80 #define ADD_INTO_VME_CM_INDEX_ARRAY(value) ; \
81 vmeCmIndexArray[vmeCmIndexArrayPosition] = value ; \
82 vmeCmIndexArrayPosition ++;
83
84 typedef CM_ARG* PCM_ARG;
85
86 #define CM_KERNEL_DATA_CLEAN 0 // kernel data clean
87 #define CM_KERNEL_DATA_KERNEL_ARG_DIRTY 1 // per kernel arg dirty
88 #define CM_KERNEL_DATA_THREAD_ARG_DIRTY (1 << 1) // per thread arg dirty
89 #define CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY (1 << 2) // indirect payload data dirty
90 #define CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY (1 << 3) // indirect payload data size changes
91 #define CM_KERNEL_DATA_GLOBAL_SURFACE_DIRTY (1 << 4) // global surface dirty
92 #define CM_KERNEL_DATA_THREAD_COUNT_DIRTY (1 << 5) // thread count dirty, reset() be called
93 #define cMKERNELDATASAMPLERBTIDIRTY (1 << 6) // sampler bti dirty
94 #define CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY (1 << 7) // threadgroupspace dirty
95
Partition(PCM_ARG * args,int32_t p,int32_t r)96 int32_t Partition( PCM_ARG* args, int32_t p, int32_t r )
97 {
98 uint16_t x = args[p]->unitOffsetInPayload;
99 int32_t i = p - 1;
100 int32_t j = r + 1;
101 while( 1 )
102 {
103 do {
104 j --;
105 } while( args[j]->unitOffsetInPayload > x );
106
107 do {
108 i ++;
109 } while( args[i]->unitOffsetInPayload < x );
110
111 if( i < j )
112 {
113 PCM_ARG tmpP = args[i];
114 args[i] = args[j];
115 args[j] = tmpP;
116 }
117 else
118 {
119 return j;
120 }
121 }
122 }
123
124 // Cannot be called directly! use macro CHECK_SURFACE_TYPE!
// Returns true when nType equals any type in the trailing argument list.
// The list is terminated by the first negative value; callers must not invoke
// this directly — the CHECK_SURFACE_TYPE macro appends the -1 terminator.
bool _CheckSurfaceType( int nType, ... )
{
    va_list argList;
    va_start( argList, nType );

    bool found = false;
    for ( int candidate = va_arg( argList, int );
          candidate >= 0;
          candidate = va_arg( argList, int ) )
    {
        if ( candidate == nType )
        {
            found = true;
            break;
        }
    }

    va_end( argList );
    return found;
}
144
QuickSort(PCM_ARG * args,int32_t p,int32_t r)145 void QuickSort( PCM_ARG* args, int32_t p, int32_t r )
146 {
147 if( p < r )
148 {
149 int32_t q = Partition( args, p, r );
150 QuickSort( args, p, q );
151 QuickSort( args, q + 1, r );
152 }
153 }
154
155 namespace CMRT_UMD
156 {
// Registers CmMovInstConstructor with the extension factory during static
// initialization so that CmExtensionCreator<CmMovInstConstructor>::CreateClass()
// (used in CmKernelRT::Initialize) can instantiate it at runtime.
static bool bCmMovInstRegistered = CmExtensionCreator<CmMovInstConstructor>::RegisterClass<CmMovInstConstructor>();
158 //*-----------------------------------------------------------------------------
159 //| Purpose: Create object for mov instructions
160 //| instructions will be copied into DstMem
161 //*-----------------------------------------------------------------------------
uint32_t CmMovInstConstructor::ConstructObjMovs(uint32_t dstOffset, uint32_t srcOffset, uint32_t size, CmDynamicArray &movInsts, uint32_t index, bool isBdw, bool isHwDebug)
{
    // Thin forwarding wrapper: delegates generation of the mov instructions
    // to MovInst_RT::CreateMoves and returns its result (the number of moves
    // created, per the call sites' usage — confirm against MovInst_RT).
    return MovInst_RT::CreateMoves(dstOffset, srcOffset, size, movInsts, index, isBdw, isHwDebug);
}
166
167 //*-----------------------------------------------------------------------------
168 //| Purpose: Create CM Kernel
169 //| Arguments :
170 //| device [in] Pointer to device
171 //| program [in] Pointer to cm Program
172 //| kernelName [in] Name of kernel
173 //| kernelId [in] Kernel's ID
174 //| kernel [in/out] Reference Pointer to CM Kernel
175 //| options [in] jitter, or non-jitter
176 //| Returns: Result of the operation.
177 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::Create(CmDeviceRT *device,
                           CmProgramRT *program,
                           const char *kernelName,
                           uint32_t kernelIndex,
                           uint32_t kernelSeqNum,
                           CmKernelRT* &kernel,
                           const char *options)
{
    int32_t result = CM_SUCCESS;
    // Advanced-executor path is preferred when the HAL state exposes one.
    CM_HAL_STATE * state = device ? ((PCM_CONTEXT_DATA)device->GetAccelData())->cmHalState : nullptr;

    if (device)
    {
        if (state && state->advExecutor)
        {
            kernel = state->advExecutor->CreateKernelRT(device, program, kernelIndex, kernelSeqNum);
        }
        else
        {
            kernel = new (std::nothrow) CmKernelRT(device, program, kernelIndex, kernelSeqNum);
        }
    }
    // NOTE(review): when device == nullptr, 'kernel' is left untouched, so the
    // following check reads whatever the caller passed in — callers appear to
    // be expected to pass a null/initialized pointer; confirm at call sites.

    if( kernel )
    {
        if (device)
        {
            device->m_memObjectCount.kernelCount++;
        }
        kernel->Acquire();  // refcount goes to 1 for the caller's reference
        result = kernel->Initialize( kernelName, options );
        if( result != CM_SUCCESS )
        {
            // Initialization failed: drop the reference just taken (this also
            // releases one program reference inside Destroy).
            CmKernelRT::Destroy( kernel, program);
            return result;
        }
    }
    else
    {
        CM_ASSERTMESSAGE("Error: Failed to create CmKernel due to out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }
    if (options)
    {
        // The predefined GPU-copy kernel gets special binary-ownership
        // handling elsewhere (see the destructor).
        if (strcmp(options, "PredefinedGPUCopyKernel") == 0)
        {
            kernel->m_blCreatingGPUCopyKernel = true;
        }
        else
        {
            kernel->m_blCreatingGPUCopyKernel = false;
        }
    }

#if USE_EXTENSION_CODE
    if (device)
        result = kernel->InitForGTPin(device, program, kernel);
#endif

    return result;
}
239
240 //*-----------------------------------------------------------------------------
//| Purpose:   Destroy Kernel
242 //| Returns: Result of the operation.
243 //*-----------------------------------------------------------------------------
Destroy(CmKernelRT * & kernel,CmProgramRT * & program)244 int32_t CmKernelRT::Destroy( CmKernelRT* &kernel, CmProgramRT *&program )
245 {
246 uint32_t refCount = kernel->SafeRelease();
247 if (refCount == 0)
248 {
249 kernel = nullptr;
250 }
251
252 refCount = program->SafeRelease();
253 if (refCount == 0)
254 {
255 program = nullptr;
256 }
257 return CM_SUCCESS;
258 }
259
260 //*-----------------------------------------------------------------------------
//| Purpose:   Acquire Kernel: increment refcount
262 //| Returns: Result of the operation.
263 //*-----------------------------------------------------------------------------
Acquire(void)264 int32_t CmKernelRT::Acquire( void)
265 {
266 m_refcount ++;
267 return m_refcount;
268 }
269
270 //*-----------------------------------------------------------------------------
271 //| Purpose: SafeRelease Kernel: Delete the instance
272 //| Returns: Result of the operation.
273 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::SafeRelease( void)
{
    --m_refcount;
    if (m_refcount == 0)
    {
        // Last reference gone: balance the device's kernel bookkeeping and,
        // when the dynamic state heap is enabled, unregister this kernel from
        // it before self-destructing.
        m_device->m_memObjectCount.kernelCount--;
        PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
        PCM_HAL_STATE state = cmData->cmHalState;
        if (state->dshEnabled)
        {
            state->pfnDSHUnregisterKernel(state, m_id);
        }
        delete this;  // no member access is valid past this point
        return 0;
    }
    // Still referenced elsewhere; report the remaining count.
    return m_refcount;
}
291
292 //*-----------------------------------------------------------------------------
293 //| Purpose: Kernel constructor
294 //| Returns: Result of the operation.
295 //*-----------------------------------------------------------------------------
CmKernelRT(CmDeviceRT * device,CmProgramRT * program,uint32_t kernelIndex,uint32_t kernelSeqNum)296 CmKernelRT::CmKernelRT(CmDeviceRT *device,
297 CmProgramRT *program,
298 uint32_t kernelIndex,
299 uint32_t kernelSeqNum):
300 m_device( device ),
301 m_surfaceMgr( nullptr ),
302 m_program( program ),
303 m_options( nullptr ),
304 m_binary( nullptr ),
305 m_binaryOrig(nullptr),
306 m_binarySize(0),
307 m_binarySizeOrig(0),
308 m_threadCount( 0 ),
309 m_lastThreadCount( 0 ),
310 m_sizeInCurbe( 0 ),
311 m_sizeInPayload( 0 ),
312 m_argCount( 0 ),
313 m_args( nullptr ),
314 m_kernelInfo(nullptr),
315 m_kernelIndexInProgram( CM_INVALID_KERNEL_INDEX ),
316 m_curbeEnabled( true ),
317 m_nonstallingScoreboardEnabled(false),
318 m_dirty( CM_KERNEL_DATA_CLEAN ),
319 m_lastKernelData( nullptr ),
320 m_lastKernelDataSize( 0 ),
321 m_indexInTask(0),
322 m_threadSpaceAssociated(false),
323 m_perThreadArgExists(false),
324 m_perKernelArgExists( false ),
325 m_threadSpace( nullptr ),
326 m_adjustScoreboardY( 0 ),
327 m_lastAdjustScoreboardY( 0 ),
328 m_blCreatingGPUCopyKernel( false),
329 m_usKernelPayloadDataSize( 0 ),
330 m_kernelPayloadData( nullptr ),
331 m_usKernelPayloadSurfaceCount( 0 ),
332 m_samplerBtiCount( 0 ),
333 m_refcount(0),
334 m_halMaxValues( nullptr ),
335 m_halMaxValuesEx( nullptr ),
336 m_surfaceArray(nullptr),
337 m_threadGroupSpace( nullptr ),
338 m_vmeSurfaceCount( 0 ),
339 m_maxSurfaceIndexAllocated(0),
340 m_barrierMode(CM_LOCAL_BARRIER),
341 m_isClonedKernel(false),
342 m_cloneKernelID(0),
343 m_hasClones( false ),
344 m_stateBufferBounded( CM_STATE_BUFFER_NONE ),
345 m_movInstConstructor(nullptr)
346 {
347 program->Acquire();
348 m_program = program;
349
350 device->GetSurfaceManager(m_surfaceMgr);
351
352 m_id = kernelSeqNum; // Unique number for each kernel. This ID is used in Batch buffer.
353 m_id <<= 32;
354 m_kernelIndex = kernelIndex;
355
356 for (int i = 0; i < CM_GLOBAL_SURFACE_NUMBER; i++)
357 {
358 m_globalSurfaces[i] = nullptr;
359 m_globalCmIndex[i] = 0;
360 }
361
362 m_blhwDebugEnable = program->IsHwDebugEnabled();
363
364 CmSafeMemSet(m_pKernelPayloadSurfaceArray, 0, sizeof(m_pKernelPayloadSurfaceArray));
365 CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, sizeof(m_IndirectSurfaceInfoArray));
366 CmSafeMemSet( m_samplerBtiEntry, 0, sizeof( m_samplerBtiEntry ) );
367
368 if (m_samplerBtiCount > 0)
369 {
370 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
371 m_samplerBtiCount = 0;
372 }
373
374 ResetKernelSurfaces();
375 }
376
377 //*-----------------------------------------------------------------------------
378 //| Purpose: Destructor of Class CmKernel
379 //| Returns: None.
380 //*-----------------------------------------------------------------------------
CmKernelRT::~CmKernelRT( void )
{
    MosSafeDeleteArray(m_options);

    // Frees m_args and any per-arg value/surface-index buffers.
    DestroyArgs();

    if(m_lastKernelData)
    {
        CmKernelData::Destroy( m_lastKernelData );
    }

    // m_binary normally aliases program-owned storage (jit binary or the CISA
    // buffer — see Initialize), so it is freed only on the GTPin path, which
    // presumably allocates a private copy. TODO confirm in InitForGTPin.
    if( m_device->CheckGTPinEnabled() && !m_blCreatingGPUCopyKernel)
    {
        MosSafeDeleteArray(m_binary);
    }

    // Balance the AcquireKernelInfo() done in Initialize().
    if( CM_INVALID_KERNEL_INDEX != m_kernelIndexInProgram )
    {
        m_program->ReleaseKernelInfo(m_kernelIndexInProgram);
    }

    for(int i=0; i< CM_GLOBAL_SURFACE_NUMBER; i++)
    {
        SurfaceIndex *surfIndex = m_globalSurfaces[i];
        MosSafeDelete(surfIndex);
    }

    MosSafeDeleteArray(m_kernelPayloadData);
    MosSafeDeleteArray(m_surfaceArray);
    MosSafeDelete(m_movInstConstructor);
}
412
413 //*-----------------------------------------------------------------------------
414 //| Purpose: Initialize CM kernel
415 //| Returns: Result of the operation.
416 //*-----------------------------------------------------------------------------
Initialize(const char * kernelName,const char * options)417 int32_t CmKernelRT::Initialize( const char* kernelName, const char* options )
418 {
419 if( kernelName == nullptr )
420 {
421 CM_ASSERTMESSAGE("Error: Kernel name is null.");
422 return CM_NULL_POINTER;
423 }
424
425 size_t length = strnlen( kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE );
426 if( length >= CM_MAX_KERNEL_NAME_SIZE_IN_BYTE )
427 {
428 CM_ASSERTMESSAGE("Error: Kernel name size is too long.");
429 return CM_FAILURE;
430 }
431
432 uint32_t kernelCount = 0;
433 m_program->GetKernelCount( kernelCount );
434
435 CM_KERNEL_INFO* kernelInfo = nullptr;
436 uint32_t i = 0;
437 for( i = 0; i < kernelCount; i ++ )
438 {
439 m_program->GetKernelInfo( i, kernelInfo );
440 if( !kernelInfo )
441 {
442 CM_ASSERTMESSAGE("Error: Invalid kernel info.");
443 return CM_NULL_POINTER;
444 }
445 if( strcmp( kernelInfo->kernelName, kernelName ) == 0 )
446 {
447 break;
448 }
449 }
450
451 if( i == kernelCount )
452 {
453 CM_ASSERTMESSAGE("Error: Invalid kernel count.");
454 return CM_FAILURE;
455 }
456
457 m_device->GetHalMaxValues( m_halMaxValues, m_halMaxValuesEx);
458
459 m_program->AcquireKernelInfo(i);
460 m_kernelInfo = kernelInfo;
461 m_kernelIndexInProgram = i;
462
463 if( options )
464 {
465 size_t length = strnlen( options, CM_MAX_OPTION_SIZE_IN_BYTE );
466 if(length >= CM_MAX_OPTION_SIZE_IN_BYTE)
467 {
468 CM_ASSERTMESSAGE("Error: Option string is too long.");
469 return CM_INVALID_ARG_VALUE;
470 }
471 else
472 {
473 m_options = MOS_NewArray(char, (length+1));
474 if( !m_options )
475 {
476 CM_ASSERTMESSAGE("Error: Out of system memory.");
477 return CM_OUT_OF_HOST_MEMORY;
478
479 }
480 CmSafeMemCopy( m_options, options, length);
481 m_options[ length ] = '\0';
482
483 char* tmp = strstr( m_options, "nocurbe" );
484 if( tmp )
485 {
486 m_curbeEnabled = false;
487 }
488 }
489 }
490
491 m_nonstallingScoreboardEnabled = true;
492
493 void* commonISACode = nullptr;
494 uint32_t commonISACodeSize = 0;
495 m_program->GetCommonISACode(commonISACode, commonISACodeSize);
496 if ((commonISACode == nullptr) || (commonISACodeSize <= 0))
497 {
498 CM_ASSERTMESSAGE("Error: Invalid VISA.");
499 return CM_INVALID_COMMON_ISA;
500 }
501
502 bool useVisaApi = true;
503 vISA::ISAfile *isaFile = nullptr;
504 vISA::KernelBody *kernelBody = nullptr;
505
506 auto getVersionAsInt = [](int major, int minor) { return major * 100 + minor; };
507 if (getVersionAsInt(m_program->m_cisaMajorVersion, m_program->m_cisaMinorVersion) < getVersionAsInt(3, 2))
508 {
509 useVisaApi = false;
510 }
511 else
512 {
513 isaFile = m_program->getISAfile();
514 if (!isaFile)
515 {
516 CM_ASSERTMESSAGE("Error: Invalid VISA.");
517 return CM_INVALID_COMMON_ISA;
518 }
519 kernelBody = isaFile->getKernelsData().at(m_kernelIndexInProgram);
520 }
521
522 uint8_t *buf = (uint8_t*)commonISACode;
523 uint32_t bytePosition = m_kernelInfo->kernelIsaOffset;
524
525 uint32_t kernelInfoRefCount = 0;
526 m_program->GetKernelInfoRefCount(m_kernelIndexInProgram, kernelInfoRefCount);
527 if (kernelInfoRefCount <= 2) //Only read for 1st time Kernel creation, later we reuse them
528 {
529 if (useVisaApi)
530 {
531 m_kernelInfo->globalStringCount = kernelBody->getStringCount();
532 }
533 {
534 READ_FIELD_FROM_BUF(m_kernelInfo->globalStringCount, unsigned short);
535 }
536
537 m_kernelInfo->globalStrings = (const char**) malloc( m_kernelInfo->globalStringCount * sizeof(char*) );
538 if(m_kernelInfo->globalStrings == nullptr)
539 {
540 CM_ASSERTMESSAGE("Error: Out of system memory.");
541 return CM_OUT_OF_HOST_MEMORY;
542 }
543 CmSafeMemSet(m_kernelInfo->globalStrings, 0, m_kernelInfo->globalStringCount * sizeof(char*) );
544
545 if (useVisaApi)
546 {
547 int i = 0;
548 for (vISA::StringPool *globalString : kernelBody->getStringPool())
549 {
550 size_t stringLength = std::strlen(globalString->getString());
551 char *string = (char*)malloc(stringLength + 1);
552 if (string == nullptr)
553 {
554 CM_ASSERTMESSAGE("Error: Out of system memory.");
555 return CM_OUT_OF_HOST_MEMORY;
556 }
557 CmSafeMemCopy(string, globalString->getString(), stringLength);
558 string[stringLength] = '\0';
559 m_kernelInfo->globalStrings[i] = string;
560 i++;
561 }
562 }
563 else
564 {
565 for (int i = 0; i < (int)m_kernelInfo->globalStringCount; i++)
566 {
567 char* string = (char*)malloc(CM_MAX_KERNEL_STRING_IN_BYTE + 1);
568 if (string == nullptr)
569 {
570 CM_ASSERTMESSAGE("Error: Out of system memory.");
571 return CM_OUT_OF_HOST_MEMORY;
572 }
573 int j = 0;
574 while (buf[bytePosition] != '\0' && j < CM_MAX_KERNEL_STRING_IN_BYTE) {
575 string[j++] = buf[bytePosition++];
576 }
577 string[j] = '\0';
578 bytePosition++;
579 m_kernelInfo->globalStrings[i] = string;
580 }
581 }
582 }
583
584 uint32_t count = 0;
585 if (useVisaApi)
586 {
587 count = kernelBody->getNumInputs();
588 }
589 else
590 {
591 bytePosition = m_kernelInfo->inputCountOffset;
592
593 uint8_t countTemp = 0;
594 READ_FIELD_FROM_BUF(countTemp, uint8_t);
595 count = countTemp;
596 }
597
598 if( count > m_halMaxValues->maxArgsPerKernel )
599 {
600 CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
601 return CM_EXCEED_KERNEL_ARG_AMOUNT;
602 }
603
604 m_args = MOS_NewArray(CM_ARG, count);
605 if( (!m_args) && (count != 0) )
606 {
607 CM_ASSERTMESSAGE("Error: Out of system memory.");
608 MosSafeDeleteArray(m_options);
609 return CM_OUT_OF_HOST_MEMORY;
610 }
611 CmSafeMemSet(m_args, 0, sizeof(CM_ARG) * count);
612 m_argCount = count;
613
614 uint32_t preDefinedSurfNum;
615 if ( (m_program->m_cisaMajorVersion > 3) || ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion >=1)) ) //CISA 3.1 +
616 {
617 preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_3_1;
618 }
619 else if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion == 0))
620 {
621 preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_2_1;
622 }
623 else //CISA 2.0
624 {
625 preDefinedSurfNum = COMMON_ISA_NUM_PREDEFINED_SURF_VER_2;
626 }
627
628 uint32_t argSize = 0;
629
630 for (uint32_t i = 0; i < m_argCount; i++)
631 {
632 vISA::InputInfo *inputInfo = nullptr;
633 uint8_t kind = 0;
634
635 if (useVisaApi)
636 {
637 inputInfo = kernelBody->getInputInfo()[i];
638 kind = inputInfo->getKind();
639 }
640 else
641 {
642 READ_FIELD_FROM_BUF(kind, uint8_t);
643 }
644
645 if (kind == 0x2) // compiler value for surface
646 {
647 kind = ARG_KIND_SURFACE;
648 // runtime value for surface. surface will be further classified to 1D/2D/3D
649 }
650 else if (kind == 0x3) // compiler value for vme index
651 {
652 kind = ARG_KIND_VME_INDEX;
653 }
654 else if (kind == 0x8)
655 {
656 kind = ARG_KIND_IMPLICT_LOCALSIZE;
657 m_args[i].isSet = true;
658 m_args[i].unitCount = 1;
659 }
660 else if (kind == 0x10) {
661 kind = ARG_KIND_IMPLICT_GROUPSIZE;
662 m_args[i].isSet = true;
663 m_args[i].unitCount = 1;
664 }
665 else if (kind == 0x18) {
666 kind = ARG_KIND_IMPLICIT_LOCALID;
667 m_args[i].isSet = true;
668 m_args[i].unitCount = 1;
669 m_perKernelArgExists = true; //only VISA3.3+, can come here, so, no matter it is there any explicit arg, implicit arg exits
670 }
671 else if (kind == 0x2A) {
672 kind = ARG_KIND_SURFACE_2D_SCOREBOARD;
673 }
674 else if (kind == 0x20) {
675 kind = ARG_KIND_GENERAL_DEPVEC;
676 }
677 else if (kind == 0x30) {
678 kind = ARG_KIND_GENERAL_DEPCNT;
679 }
680 else if (kind == 0x80) {
681 // IMP_PSEUDO_INPUT = 0x80 is pseudo input. All inputs after this
682 // will be ignored by CMRT without checking and payload copied.
683 // This resizes the argument count to achieve this.
684 m_argCount = i;
685 break;
686 }
687
688 m_args[i].unitKind = kind;
689 m_args[i].unitKindOrig = kind;
690
691 if (kind == ARG_KIND_SURFACE && m_kernelInfo->surfaceCount)
692 {
693 m_args[i].surfaceKind = DATA_PORT_SURF;
694 }
695
696 if (useVisaApi)
697 {
698 m_args[i].unitOffsetInPayload = inputInfo->getOffset();
699 m_args[i].unitOffsetInPayloadOrig = inputInfo->getOffset();
700
701 m_args[i].unitSize = inputInfo->getSize();
702 m_args[i].unitSizeOrig = inputInfo->getSize();
703 }
704 else
705 {
706 uint32_t varID;
707 READ_FIELD_FROM_BUF(varID, uint16_t);
708
709 uint16_t tmpW;
710 READ_FIELD_FROM_BUF(tmpW, uint16_t);
711 m_args[i].unitOffsetInPayload = tmpW;
712 m_args[i].unitOffsetInPayloadOrig = tmpW;
713
714 READ_FIELD_FROM_BUF(tmpW, uint16_t);
715 m_args[i].unitSize = tmpW;
716 m_args[i].unitSizeOrig = tmpW;
717 }
718
719 argSize += m_args[i].unitSize;
720 }
721 //////////////////////////////////////////////////////////////////////////
722
723 if (kernelInfoRefCount <= 2) //Only read for 1st time Kernel creation, later we reuse them
724 {
725 uint16_t attributeCount = 0;
726 if (useVisaApi)
727 {
728 attributeCount = kernelBody->getAttributeCount();
729 }
730 else
731 {
732 /////////////////////////////////////////////////////////////////////////
733 // Get pre-defined kernel attributes, Start
734 //skipping size and entry
735 bytePosition += 8;
736
737 READ_FIELD_FROM_BUF(attributeCount, uint16_t);
738 }
739
740 for (int i = 0; i < attributeCount; i++)
741 {
742 vISA::AttributeInfo *attribute = nullptr;
743 uint32_t nameIndex = 0;
744 uint8_t size = 0;
745
746 if (useVisaApi)
747 {
748 attribute = kernelBody->getAttributeInfo()[i];
749 nameIndex = attribute->getName();
750 size = attribute->getSize();
751 }
752 else
753 {
754 READ_FIELD_FROM_BUF(nameIndex, uint16_t);
755 READ_FIELD_FROM_BUF(size, uint8_t);
756 }
757
758 if( strcmp( m_kernelInfo->globalStrings[nameIndex], "AsmName" ) == 0 )
759 {
760 if (useVisaApi)
761 {
762 CmSafeMemCopy(m_kernelInfo->kernelASMName, attribute->getValue(), size);
763 }
764 else
765 {
766 CmSafeMemCopy(m_kernelInfo->kernelASMName, &buf[bytePosition], size);
767 bytePosition += size;
768 }
769 }
770 else if (strcmp( m_kernelInfo->globalStrings[nameIndex], "SLMSize" ) == 0)
771 {
772 if (useVisaApi)
773 {
774 m_kernelInfo->kernelSLMSize = attribute->getValue()[0];
775 }
776 else
777 {
778 READ_FIELD_FROM_BUF(m_kernelInfo->kernelSLMSize, uint8_t);
779 }
780
781 /* Notes by Stony@2014-04-09
782 * <=CISA3.1: the size is number of 4KB
783 * > CISA3.1: the size is number of 1KB
784 * Here convert it to the number of 1KB if <=CISA 3.1
785 */
786 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion <= 1))
787 {
788 m_kernelInfo->kernelSLMSize = m_kernelInfo->kernelSLMSize * 4;
789 }
790
791 // align to power of 2
792 uint32_t v = m_kernelInfo->kernelSLMSize;
793 v--;
794 v |= v >> 1;
795 v |= v >> 2;
796 v |= v >> 4;
797 v |= v >> 8;
798 v |= v >> 16;
799 v++;
800 m_kernelInfo->kernelSLMSize = ( uint8_t )v;
801 }
802 else if (strcmp(m_kernelInfo->globalStrings[nameIndex], "NoBarrier") == 0)
803 {
804 m_kernelInfo->blNoBarrier = true;
805 if (!useVisaApi)
806 {
807 bytePosition += size;
808 }
809 }
810 else
811 {
812 if (!useVisaApi)
813 {
814 bytePosition += size;
815 }
816 }
817 }
818 if (m_kernelInfo->blNoBarrier && m_options && strstr(m_options, "-hasBarrier"))
819 {
820 m_kernelInfo->blNoBarrier = false;
821 }
822 }
823
824 if(argSize > m_halMaxValues->maxArgByteSizePerKernel)
825 {
826 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
827 return CM_EXCEED_KERNEL_ARG_SIZE_IN_BYTE;
828 }
829
830 buf = (uint8_t*)commonISACode;
831
832 if(m_program->IsJitterEnabled())
833 {
834 //m_JitterEnable = true;
835 char *programOptions;
836 m_program->GetKernelOptions(programOptions);
837 //if no options or same options, copy load program's binary. else re-jitter
838 {
839 m_binary = (char *)m_kernelInfo->jitBinaryCode;
840 m_binarySize = m_kernelInfo->jitBinarySize;
841 m_kernelInfo->origBinary = m_kernelInfo->jitBinaryCode;
842 m_kernelInfo->origBinarySize = m_kernelInfo->jitBinarySize;
843 }
844 }
845 else
846 {
847 char* binary = (char*)(buf + m_kernelInfo->genxBinaryOffset );
848
849 //No copy, point to the binary offset in CISA code.
850 m_binary = binary;
851 m_binarySize = m_kernelInfo->genxBinarySize;
852
853 m_kernelInfo->origBinary = binary;
854 m_kernelInfo->origBinarySize = m_kernelInfo->genxBinarySize;
855 }
856
857 if (m_kernelInfo->blNoBarrier)
858 {
859 m_barrierMode = CM_NO_BARRIER;
860 }
861
862 m_movInstConstructor = CmExtensionCreator<CmMovInstConstructor>::CreateClass();
863 if (m_movInstConstructor == nullptr)
864 {
865 CM_ASSERTMESSAGE("Error: Failed to allocate movInstConstructor due to out of system memory.");
866 return CM_OUT_OF_HOST_MEMORY;
867 }
868
869 CmNotifierGroup *notifiers = m_device->GetNotifiers();
870 if (notifiers != nullptr)
871 {
872 notifiers->NotifyKernelCreated(this);
873 }
874
875 return CM_SUCCESS;
876 }
877
878 //*-----------------------------------------------------------------------------
879 //! A CmKernel can run in multiple threads concurrently. This
//! function is to set the number of threads.
881 //! INPUT:
882 //! number of threads
883 //! OUTPUT:
884 //! CM_SUCCESS or
885 //! CM_INVALID_ARG_VALUE if the number is larger than CmKernel's capacity
886 //*-----------------------------------------------------------------------------
SetThreadCount(uint32_t count)887 CM_RT_API int32_t CmKernelRT::SetThreadCount(uint32_t count )
888 {
889 INSERT_API_CALL_LOG(GetHalState());
890 // Check per kernel, per task check will be at enqueue time
891 if ((int)count <= 0)
892 return CM_INVALID_ARG_VALUE;
893
894 if (m_threadSpace == nullptr)
895 {
896 if (m_threadCount)
897 {
898 // Setting threadCount twice with different values will cause reset of kernels
899 if (m_threadCount != count)
900 {
901 Reset();
902 m_threadCount = count;
903 m_dirty |= CM_KERNEL_DATA_THREAD_COUNT_DIRTY;
904 }
905 }
906 else // first time
907 {
908 m_threadCount = count;
909 }
910 }
911 return CM_SUCCESS;
912 }
913
GetThreadCount(uint32_t & count)914 int32_t CmKernelRT::GetThreadCount(uint32_t& count )
915 {
916 count = m_threadCount;
917 return CM_SUCCESS;
918 }
919
GetKernelSurfaces(bool * & surfArray)920 int32_t CmKernelRT::GetKernelSurfaces(bool *&surfArray)
921 {
922 surfArray = m_surfaceArray;
923 return CM_SUCCESS;
924 }
925
ResetKernelSurfaces()926 int32_t CmKernelRT::ResetKernelSurfaces()
927 {
928 uint32_t surfacePoolSize = m_surfaceMgr->GetSurfacePoolSize();
929 if (!m_surfaceArray)
930 {
931 m_surfaceArray = MOS_NewArray(bool, surfacePoolSize);
932 if (!m_surfaceArray)
933 {
934 CM_ASSERTMESSAGE("Error: Failed to rest kernel surfaces due to out of system memory.");
935 return CM_OUT_OF_HOST_MEMORY;
936 }
937 }
938 CmSafeMemSet( m_surfaceArray, 0, surfacePoolSize * sizeof( bool ) );
939
940 return CM_SUCCESS;
941 }
942
943 //*-----------------------------------------------------------------------------
944 //| Purpose: Get CmSurface from surface manager.
945 //| Use "value + indexSurfaceArray" to locate its surfaceIndex
946 //| Returns: CmSurface. Null if not found
947 //*-----------------------------------------------------------------------------
CmSurface* CmKernelRT::GetSurfaceFromSurfaceArray( SurfaceIndex* value, uint32_t indexSurfaceArray)
{
    // hr is written by the CM_CHK_NULL_GOTOFINISH_CMERROR macro on failure.
    int32_t hr = CM_SUCCESS;
    CmSurface *surface = nullptr;
    SurfaceIndex* surfaceIndex = nullptr;

    // 'value' points at the first element of a SurfaceIndex array; select the
    // requested element by offset.
    surfaceIndex = value + indexSurfaceArray;
    CM_CHK_NULL_GOTOFINISH_CMERROR(surfaceIndex);

    // A zero or CM_NULL_SURFACE index means "no surface"; return the null
    // sentinel cast to the pointer type rather than looking it up.
    if (surfaceIndex->get_data() == CM_NULL_SURFACE
        || surfaceIndex->get_data() == 0)
    {
        surface = (CmSurface *)CM_NULL_SURFACE;
        goto finish;
    }

    // Resolve the index through the surface manager; 'surface' stays nullptr
    // when the lookup fails, which is the "not found" return documented above.
    m_surfaceMgr->GetSurface(surfaceIndex->get_data(), surface);

finish:
    return surface;
}
969
970 //*-----------------------------------------------------------------------------
971 //| Purpose: Set kernel arg for single vme surface or multiple vme surfaces
972 //| in surface array. So far, don't support vme surface array in thread arg.
973 //| Returns: Result of the operation.
974 //*-----------------------------------------------------------------------------
SetArgsVme(CM_KERNEL_INTERNAL_ARG_TYPE nArgType,uint32_t argIndex,const void * value,uint32_t nThreadID)975 int32_t CmKernelRT::SetArgsVme(CM_KERNEL_INTERNAL_ARG_TYPE nArgType, uint32_t argIndex, const void *value, uint32_t nThreadID)
976 {
977 uint32_t elementNum = 0;
978 CM_ARG& arg = m_args[ argIndex ];
979 uint32_t totalVmeArgValueSize = 0;
980 uint32_t totalSurfacesInVme = 0;
981 uint32_t tempVmeArgValueSize = 0;
982 uint32_t vmeArgValueOffset = 0;
983 uint32_t lastVmeSurfCount = 0;
984 CmSurfaceVme* surfVme = nullptr;
985 uint8_t *vmeArgValueArray = nullptr;
986 uint16_t *vmeCmIndexArray = nullptr;
987 int32_t hr = CM_SUCCESS;
988
989 //Get Number of elements in surface array
990 if (arg.unitVmeArraySize == 0)
991 { //First Time
992 elementNum = arg.unitSize / sizeof(uint32_t);
993 }
994 else
995 {
996 elementNum = arg.unitVmeArraySize;
997 }
998
999 //Get Size of vmeIndexArray and vmeCmIndexArray.
1000 for(uint32_t i=0; i< elementNum; i++)
1001 {
1002 if (((SurfaceIndex*)(value)+i)->get_data() == 0 || ((SurfaceIndex*)(value)+i)->get_data() == CM_NULL_SURFACE)
1003 {
1004 tempVmeArgValueSize = sizeof(CM_HAL_VME_ARG_VALUE);
1005 totalVmeArgValueSize += tempVmeArgValueSize;
1006 totalSurfacesInVme++;
1007 }
1008 else
1009 {
1010 surfVme = static_cast<CmSurfaceVme*>(GetSurfaceFromSurfaceArray((SurfaceIndex*)value, i));
1011 CM_CHK_NULL_GOTOFINISH_CMERROR(surfVme);
1012 tempVmeArgValueSize = surfVme->GetVmeCmArgSize();
1013 totalVmeArgValueSize += tempVmeArgValueSize;
1014 totalSurfacesInVme += surfVme->GetTotalSurfacesCount();
1015 }
1016 }
1017
1018 // Allocate and Zero Memory for arg.pValue and arg.surfIndex
1019 // arg.pValue : an array of CM_HAL_VME_ARG_VALUE structure followed by an array of reference surfaces
1020 // arg.surfIndex : an array listing all the Cm surface indexes, in the order of current, fw surfaces, bw surfaces
1021
1022 if (arg.unitSize < totalVmeArgValueSize) // need to re-allocate larger area)
1023 {
1024 if (arg.value)
1025 {
1026 MosSafeDeleteArray(arg.value);
1027 }
1028 arg.value = MOS_NewArray(uint8_t, totalVmeArgValueSize);
1029
1030 if (arg.surfIndex)
1031 {
1032 MosSafeDeleteArray(arg.surfIndex);
1033 }
1034 arg.surfIndex = MOS_NewArray(uint16_t, totalSurfacesInVme);
1035 }
1036
1037 CM_CHK_NULL_GOTOFINISH_CMERROR(arg.value);
1038 CmSafeMemSet(arg.value, 0, totalVmeArgValueSize);
1039 CM_CHK_NULL_GOTOFINISH_CMERROR(arg.surfIndex);
1040 CmSafeMemSet(arg.surfIndex, 0, totalSurfacesInVme * sizeof(uint16_t));
1041
1042 //Set each Vme Surface
1043 for (uint32_t i = 0; i< elementNum; i++)
1044 {
1045 if (((SurfaceIndex*)(value)+i)->get_data() == 0 || ((SurfaceIndex*)(value)+i)->get_data() == CM_NULL_SURFACE)
1046 {
1047 PCM_HAL_VME_ARG_VALUE vmeArg = (PCM_HAL_VME_ARG_VALUE)(arg.value + vmeArgValueOffset);
1048 vmeArg->fwRefNum = 0;
1049 vmeArg->bwRefNum = 0;
1050 vmeArg->curSurface = CM_NULL_SURFACE;
1051 tempVmeArgValueSize = sizeof(CM_HAL_VME_ARG_VALUE);
1052 vmeArgValueOffset += tempVmeArgValueSize;
1053 arg.surfIndex[lastVmeSurfCount] = CM_NULL_SURFACE;
1054 lastVmeSurfCount++;
1055 }
1056 else
1057 {
1058 surfVme = static_cast<CmSurfaceVme*>(GetSurfaceFromSurfaceArray((SurfaceIndex*)value, i));
1059 CM_CHK_NULL_GOTOFINISH_CMERROR(surfVme);
1060 SetArgsSingleVme(surfVme, arg.value + vmeArgValueOffset, arg.surfIndex + lastVmeSurfCount);
1061 tempVmeArgValueSize = surfVme->GetVmeCmArgSize();
1062 vmeArgValueOffset += tempVmeArgValueSize;
1063 lastVmeSurfCount += surfVme->GetTotalSurfacesCount();
1064 }
1065 }
1066
1067 if ( nArgType == CM_KERNEL_INTERNEL_ARG_PERKERNEL ) // per kernel arg
1068 {
1069 // First time set
1070 if( !arg.value )
1071 { // Increment size kernel arguments will take up in CURBE
1072 m_sizeInCurbe += CM_ARGUMENT_SURFACE_SIZE * elementNum;
1073 }
1074
1075 arg.unitCount = 1;
1076 arg.isDirty = true;
1077 arg.isSet = true;
1078 arg.unitKind = ARG_KIND_SURFACE_VME;
1079 arg.unitSize = (uint16_t)totalVmeArgValueSize; // the unitSize can't represent surfaces count here
1080 arg.unitVmeArraySize = elementNum;
1081
1082 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1083 m_perKernelArgExists = true;
1084 }
1085 else
1086 {
1087         // Thread args do not support VME surfaces: they are rarely used and complex to implement,
1088         // since each thread may have a different surface count in its VME surface argument.
1089 hr = CM_THREAD_ARG_NOT_ALLOWED;
1090 }
1091
1092 finish:
1093 if(hr != CM_SUCCESS)
1094 {
1095 MosSafeDeleteArray(arg.value);
1096 MosSafeDeleteArray(arg.surfIndex);
1097 }
1098 return hr;
1099
1100 }
1101
1102 //*-----------------------------------------------------------------------------
1103 //| Purpose:    Fill arg for a single vme surface.
1104 //|             vmeArgValueArray points to arg.value
1105 //|             cmSufacesArray points to arg.surfIndex
1106 //| Returns:    Result of the operation.
1107 //*-----------------------------------------------------------------------------
SetArgsSingleVme(CmSurfaceVme * vmeSurface,uint8_t * vmeArgValueArray,uint16_t * cmSufacesArray)1108 int32_t CmKernelRT::SetArgsSingleVme(CmSurfaceVme* vmeSurface, uint8_t *vmeArgValueArray, uint16_t *cmSufacesArray)
1109 {
1110
1111 int32_t hr = CM_SUCCESS;
1112 CM_SURFACE_MEM_OBJ_CTRL memCtl;
1113 uint32_t vmeBackwardSurfaceCount = 0;
1114 uint32_t vmeForwardSurfaceCount = 0;
1115 uint32_t vmeCurrentSurfaceIndex = 0;
1116 uint16_t vmeCurrentCmIndex = 0;
1117 int32_t vmeIndexArrayPosition = 0; // Offset for vmeIndexArray
1118 int32_t vmeCmIndexArrayPosition = 0; // Offset for vmeCmIndexArray
1119 uint32_t tempOutput = 0;
1120 uint32_t cmSurfArrayIdx = 0;
1121 uint32_t surfStateWidth = 0;
1122 uint32_t surfStateHeight = 0;
1123
1124 uint32_t *fArray = nullptr;
1125 uint32_t *bArray = nullptr;
1126 uint32_t *fCmIndex = nullptr;
1127 uint32_t *bCmIndex = nullptr;
1128
1129 uint32_t *fwSurfInArg = nullptr;
1130 uint32_t *bwSurfInArg = nullptr;
1131
1132 CmSurface *surface = nullptr;
1133 PCM_HAL_VME_ARG_VALUE vmeArg = (PCM_HAL_VME_ARG_VALUE)vmeArgValueArray;
1134
1135 CM_CHK_NULL_GOTOFINISH_CMERROR(vmeSurface);
1136 CM_CHK_NULL_GOTOFINISH_CMERROR(vmeArg);
1137 CM_CHK_NULL_GOTOFINISH_CMERROR(cmSufacesArray);
1138
1139 if(vmeSurface == (CmSurfaceVme *)CM_NULL_SURFACE)
1140 {
1141 vmeArg->fwRefNum = 0;
1142 vmeArg->bwRefNum = 0;
1143 vmeArg->curSurface = CM_NULL_SURFACE;
1144 cmSufacesArray[cmSurfArrayIdx] = CM_NULL_SURFACE;
1145 return hr;
1146 }
1147
1148 // Get Vme Backward Forward Surface Count
1149 vmeSurface->GetIndexBackwardCount(vmeBackwardSurfaceCount);
1150 vmeSurface->GetIndexForwardCount(vmeForwardSurfaceCount);
1151
1152 vmeArg->fwRefNum = vmeForwardSurfaceCount;
1153 vmeArg->bwRefNum = vmeBackwardSurfaceCount; // these two numbers must be set before any other operations
1154
1155 vmeSurface->GetSurfaceStateResolution(vmeArg->surfStateParam.surfaceStateWidth, vmeArg->surfStateParam.surfaceStateHeight);
1156
1157 vmeSurface->GetIndexForwardArray(fArray);
1158 vmeSurface->GetIndexBackwardArray(bArray);
1159 vmeSurface->GetIndexCurrent(vmeCurrentSurfaceIndex);
1160
1161 vmeSurface->GetCmIndexCurrent(vmeCurrentCmIndex);
1162 vmeSurface->GetCmIndexForwardArray(fCmIndex);
1163 vmeSurface->GetCmIndexBackwardArray(bCmIndex);
1164
1165 cmSufacesArray[cmSurfArrayIdx++] = vmeCurrentCmIndex;
1166
1167 // Set Current Vme Surface
1168 m_surfaceMgr->GetSurface(vmeCurrentCmIndex, surface);
1169 CM_CHK_NULL_GOTOFINISH_CMERROR(surface);
1170
1171 vmeArg->curSurface = vmeCurrentSurfaceIndex;
1172
1173 //Set Forward Vme Surfaces
1174 fwSurfInArg = findFwRefInVmeArg(vmeArg);
1175 for (uint32_t i = 0; i < vmeForwardSurfaceCount; i++)
1176 {
1177 GetVmeSurfaceIndex( fArray, fCmIndex, i, &tempOutput);
1178 fwSurfInArg[i] = tempOutput;
1179 cmSufacesArray[cmSurfArrayIdx++] = (uint16_t)fCmIndex[i];
1180 }
1181
1182 //Set Backward Vme Surfaces
1183 bwSurfInArg = findBwRefInVmeArg(vmeArg);
1184 for (uint32_t i = 0; i < vmeBackwardSurfaceCount; i++)
1185 {
1186 GetVmeSurfaceIndex( bArray, bCmIndex, i, &tempOutput);
1187 bwSurfInArg[i] = tempOutput;
1188 cmSufacesArray[cmSurfArrayIdx++] = (uint16_t)bCmIndex[i];
1189 }
1190
1191 finish:
1192 return hr;
1193 }
1194
1195 //*-----------------------------------------------------------------------------
1196 //| Purpose: Get Vme Surface Index with memory object setting .
1197 //| Output value will be filled into arg.pValue
1198 //| Returns: Result of the operation.
1199 //*-----------------------------------------------------------------------------
GetVmeSurfaceIndex(uint32_t * vmeIndexArray,uint32_t * vmeCmIndexArray,uint32_t index,uint32_t * outputValue)1200 int32_t CmKernelRT::GetVmeSurfaceIndex(
1201 uint32_t *vmeIndexArray,
1202 uint32_t *vmeCmIndexArray,
1203 uint32_t index,
1204 uint32_t *outputValue)
1205 {
1206 int32_t hr = CM_SUCCESS;
1207 uint32_t value = vmeIndexArray[index];
1208
1209 if (vmeIndexArray[index] == CM_INVALID_VME_SURFACE)
1210 {
1211 value = CM_NULL_SURFACE;
1212 }
1213
1214 *outputValue = value;
1215
1216 return hr;
1217 }
1218
1219 //*-----------------------------------------------------------------------------
1220 //| Purpose: Set arguments for function SetKernelArg().
1221 //| Kernel argument is surface array.
1222 //! INPUT:
1223 //! 1) Current index in surface array
1224 //! 2) Index of kernel argument
1225 //! 3) Surface count in surface array
1226 //! 4) Pointer to current surface in surface array.
1227 //! 5) Current surface index
1228 //! 6) Pointer to argument value
1229 //! 7) value of surface handle combined with memory object control
1230 //! 8) Original surface index for each surface in array
1231 //| Returns: Result of the operation.
1232 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::SetArgsInternalSurfArray(
    int32_t offset,uint32_t kernelArgIndex,
    int32_t surfCount, CmSurface* currentSurface,
    uint32_t currentSurfIndex, SurfaceIndex* value,
    uint32_t surfValue[], uint16_t origSurfIndex[])
{
    // Walks a surface-array kernel argument starting at 'offset', resolving
    // each element's runtime handle into surfValue[] and its original Cm
    // surface index into origSurfIndex[], and recording the per-element kind
    // in m_args[kernelArgIndex].surfArrayArg. Null entries in 'value' are
    // emitted as CM_NULL_SURFACE. Returns CM_SUCCESS or a CM error code.
    CM_SURFACE_MEM_OBJ_CTRL memCtl;
    uint32_t surfRegTableIndex = 0;
    uint32_t handle = 0;
    uint32_t samplerIndex;    // assigned within the sampler cases before use
    uint16_t samplerCmIndex;  // assigned within the sampler cases before use
    uint32_t surfaceArraySize = 0;

    m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
    // Re-create the per-element bookkeeping array for this argument.
    MosSafeDeleteArray(m_args[kernelArgIndex].surfArrayArg); // delete it if it was allocated
    m_args[kernelArgIndex].surfArrayArg = MOS_NewArray(SURFACE_ARRAY_ARG, surfCount);
    if (!m_args[kernelArgIndex].surfArrayArg)
    {
        CM_ASSERTMESSAGE("Error: Out of system memory.");
        return CM_OUT_OF_HOST_MEMORY;
    }
    CmSafeMemSet((void *)m_args[kernelArgIndex].surfArrayArg, 0, sizeof(SURFACE_ARRAY_ARG) * surfCount);
    while (offset < surfCount)
    {
        // Dispatch on the concrete surface class of the current element and
        // record its handle plus the argument kind for that slot.
        switch (currentSurface->Type())
        {
        case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
            {
                CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(currentSurface);

                uint32_t numAliases = 0;
                surf2D->GetNumAliases(numAliases);
                if (numAliases)
                {
                    m_args[kernelArgIndex].aliasCreated = true;
                }
                else
                {
                    m_args[kernelArgIndex].aliasCreated = false;
                }

                //set memory object control
                surf2D->GetIndexFor2D(surfRegTableIndex);

                surfValue[offset] = surfRegTableIndex;
                origSurfIndex[offset] = (uint16_t)currentSurfIndex;

                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_2D;
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_2D;

                break;
            }
        case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
            {
                CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(currentSurface);

                uint32_t numAliases = 0;
                surf1D->GetNumAliases(numAliases);
                if (numAliases)
                {
                    m_args[kernelArgIndex].aliasCreated = true;
                }
                else
                {
                    m_args[kernelArgIndex].aliasCreated = false;
                }

                //set memory object control
                surf1D->GetHandle(handle);

                surfValue[offset] = handle;
                origSurfIndex[offset] = (uint16_t)currentSurfIndex;

                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_1D;
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_1D;
                break;
            }
        case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
            {
                CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(currentSurface);

                //set memory object
                surf2DUP->GetHandle(handle);

                surfValue[offset] = handle;
                origSurfIndex[offset] = (uint16_t)currentSurfIndex;

                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_2D_UP;
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_2D_UP;
                break;
            }
        case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
            {
                CmSurface3DRT* surf3D = static_cast<CmSurface3DRT*>(currentSurface);

                surf3D->GetHandle(handle);

                surfValue[offset] = handle;
                origSurfIndex[offset] = (uint16_t)currentSurfIndex;

                m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_3D;
                m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_3D;

                break;
            }

        case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
            {
                CmStateBuffer* stateBuffer = static_cast< CmStateBuffer* >( currentSurface );
                stateBuffer->GetHandle( handle );

                surfValue[ offset ] = handle;
                origSurfIndex[ offset ] = ( uint16_t )currentSurfIndex;

                m_args[ kernelArgIndex ].surfArrayArg[ offset ].argKindForArray = ARG_KIND_STATE_BUFFER;
                m_args[ kernelArgIndex ].unitKind = ARG_KIND_STATE_BUFFER;

                break;
            }

        //sampler surface
        case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
            {
                CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (currentSurface);
                surfSampler->GetHandle(samplerIndex);
                surfSampler->GetCmIndexCurrent(samplerCmIndex);

                // Re-resolve to the underlying surface the sampler wraps.
                m_surfaceMgr->GetSurface(samplerCmIndex, currentSurface);
                if (!currentSurface)
                {
                    CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
                    return CM_NULL_POINTER;
                }

                surfValue[offset] = samplerIndex;
                origSurfIndex[offset] = (uint16_t)samplerCmIndex;

                // The recorded kind depends on the sampler's backing surface type.
                SAMPLER_SURFACE_TYPE type;
                surfSampler->GetSurfaceType(type);
                if (type == SAMPLER_SURFACE_TYPE_2D)
                {
                    m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER;
                    m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER;
                }
                else if (type == SAMPLER_SURFACE_TYPE_2DUP)
                {
                    m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE2DUP_SAMPLER;
                    m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE2DUP_SAMPLER;
                }
                else if(type == SAMPLER_SURFACE_TYPE_3D)
                {
                    m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_3D;
                    m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_3D;
                }
                else
                {
                    CM_ASSERTMESSAGE("Error: Assign a Sampler surface to the arg which is previously 2D/3D surface.");
                    return CM_FAILURE;
                }
                break;
            }
        //sampler8x8surface
        case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
            {
                CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (currentSurface);
                surfSampler8x8->GetIndexCurrent(samplerIndex);
                surfSampler8x8->GetCmIndex(samplerCmIndex);

                // Re-resolve to the underlying surface the sampler8x8 wraps.
                m_surfaceMgr->GetSurface(samplerCmIndex, currentSurface);
                if (!currentSurface)
                {
                    CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
                    return CM_FAILURE;
                }

                surfValue[offset] = samplerIndex;
                origSurfIndex[offset] = (uint16_t)samplerCmIndex;

                CM_SAMPLER8x8_SURFACE type;
                type = surfSampler8x8->GetSampler8x8SurfaceType();
                if (type == CM_VA_SURFACE)
                {
                    m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
                    m_args[kernelArgIndex].surfArrayArg[offset].addressModeForArray = surfSampler8x8->GetAddressControlMode();
                    m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER8X8_VA;
                }
                else if(type == CM_AVS_SURFACE)
                {
                    m_args[kernelArgIndex].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
                    m_args[kernelArgIndex].surfArrayArg[offset].argKindForArray = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
                }
                else
                {
                    CM_ASSERTMESSAGE("Error: Assign a Sampler8x8 surface to the arg which is previously 2D surface.");
                    return CM_FAILURE;
                }
                break;
            }
        default:
            {
                CM_ASSERTMESSAGE("Error: No matched surface for surface array");
                return CM_INVALID_ARG_VALUE;
            }
        }
        // Advance to the next array element, emitting CM_NULL_SURFACE for any
        // run of null/zero entries in the caller-provided index array.
        offset++;
        if (offset < surfCount)
        {
            currentSurfIndex = value[offset].get_data();

            while ((!currentSurfIndex && (offset < surfCount))||(currentSurfIndex == CM_NULL_SURFACE))
            {
                surfValue[offset] = CM_NULL_SURFACE;
                origSurfIndex[offset] = 0;
                offset++;
                if (offset >= surfCount)
                    break;
                currentSurfIndex = value[offset].get_data();
            }

            if(surfaceArraySize == 0)
            {
                CM_ASSERTMESSAGE("Error: No surface in surface array");
                return CM_NO_AVAILABLE_SURFACE;
            }
            // Indexes above the registration-table size are aliases; wrap them
            // back onto the underlying table entry.
            if (currentSurfIndex > surfaceArraySize)
            {
                currentSurfIndex = currentSurfIndex % surfaceArraySize;
            }
        }
        if (offset < surfCount)
        {
            m_surfaceMgr->GetSurface(currentSurfIndex, currentSurface);
            if (nullptr == currentSurface)
            {
                CM_ASSERTMESSAGE("Error: Pointer to current surface is null.");
                return CM_FAILURE;
            }
        }
    }
    return CM_SUCCESS;
}
1474 //*-----------------------------------------------------------------------------
1475 // Set arguments for function SetKernelArg() and SetThreadArg()
1476 // Set parameter nArgType to CM_KERNEL_INTERNEL_ARG_KERNEL to set a kernel
1477 // argument; set parameter nArgType to CM_KERNEL_INTERNEL_ARG_THREAD to set
1478 // a thread argument
1479 //*-----------------------------------------------------------------------------
SetArgsInternal(CM_KERNEL_INTERNAL_ARG_TYPE nArgType,uint32_t index,size_t size,const void * value,uint32_t nThreadID)1480 int32_t CmKernelRT::SetArgsInternal( CM_KERNEL_INTERNAL_ARG_TYPE nArgType, uint32_t index, size_t size, const void *value, uint32_t nThreadID )
1481 {
1482 uint32_t surfRegTableIndex = 0; // for 2D surf
1483 uint32_t handle = 0; // for 1D surf
1484
1485 uint32_t samplerIndex;
1486 uint16_t samplerCmIndex;
1487 uint32_t samplerIdx = 0;
1488 uint32_t vmeIdx = 0;
1489 uint16_t *surfIndexValue = nullptr;
1490 uint32_t surfaces[CM_MAX_ARGS_PER_KERNEL];
1491 uint16_t surfIndexArray[CM_MAX_ARGS_PER_KERNEL];
1492 std::vector< int > sampler_index_array;
1493
1494 //Clear "set" flag in case user call API to set the same one argument multiple times.
1495 m_args[index].isSet = false;
1496 if( m_args[ index ].unitKind == ARG_KIND_GENERAL || (m_args[index].unitKind == ARG_KIND_GENERAL_DEPVEC) || (m_args[index].unitKind == ARG_KIND_GENERAL_DEPCNT))
1497 {
1498 if( size != m_args[ index ].unitSize )
1499 {
1500 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
1501 return CM_INVALID_ARG_SIZE;
1502 }
1503 }
1504 //For surface type
1505 else if (CHECK_SURFACE_TYPE(m_args[index].unitKind,
1506 ARG_KIND_SURFACE,
1507 ARG_KIND_SURFACE_1D,
1508 ARG_KIND_SURFACE_2D,
1509 ARG_KIND_SURFACE_2D_UP,
1510 ARG_KIND_SURFACE_3D,
1511 ARG_KIND_SURFACE_SAMPLER,
1512 ARG_KIND_SURFACE2DUP_SAMPLER,
1513 ARG_KIND_SURFACE_VME,
1514 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
1515 ARG_KIND_SURFACE_SAMPLER8X8_VA,
1516 ARG_KIND_SURFACE_2D_SCOREBOARD,
1517 ARG_KIND_STATE_BUFFER
1518 ))
1519 {
1520
1521 // this code is to convert SurfaceIndex object to index of type uint32_t,
1522 // which is expected by commonISA/genBinary
1523 // index is the index of the surface in surface registration table of CM device
1524 // in driver
1525
1526 int signatureSize = m_args[index].unitSize;
1527 int numSurfaces = signatureSize / sizeof(int);
1528 SurfaceIndex* surfIndex = (SurfaceIndex*)value;
1529 if (surfIndex == (SurfaceIndex*)CM_NULL_SURFACE)
1530 {
1531 m_args[index].isSet = true;
1532 m_args[index].unitCount = 1; // per kernel arg
1533 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1534 m_perKernelArgExists = true;
1535 m_args[index].isDirty = true;
1536 m_args[index].isNull = true;
1537 return CM_SUCCESS;
1538 }
1539 else
1540 {
1541 // In case that CM_NULL_SURFACE was set at last time and will
1542 // set a read surface index this time. So need set isDirty as
1543 // well to indicate update kernel data.
1544 if (m_args[index].isNull == true)
1545 {
1546 m_args[index].isDirty = true;
1547 m_args[index].isNull = false;
1548 }
1549 }
1550
1551 m_args[index].isNull = false;
1552 CM_SURFACE_MEM_OBJ_CTRL memCtl;
1553
1554 if (m_args[index].unitKind != ARG_KIND_SURFACE_VME)
1555 {
1556 if (size != sizeof(SurfaceIndex)* numSurfaces)
1557 {
1558 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
1559 return CM_INVALID_ARG_SIZE;
1560 }
1561 }
1562
1563 uint32_t surfIndexData = surfIndex->get_data();
1564 int i = 0;
1565 uint32_t surfaceArraySize = 0;
1566 m_surfaceMgr->GetSurfaceArraySize(surfaceArraySize);
1567
1568 if (surfIndexData > surfaceArraySize)
1569 {
1570 if (m_args[index].aliasIndex != surfIndexData)
1571 {
1572 m_args[index].isDirty = true;
1573 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1574 m_args[index].aliasIndex = surfIndexData;
1575 }
1576
1577 surfIndexData = surfIndexData % surfaceArraySize;
1578 }
1579 else
1580 {
1581 m_args[index].aliasIndex = 0;
1582 }
1583
1584 while (!surfIndexData && (i < numSurfaces))
1585 {
1586 surfaces[i] = CM_NULL_SURFACE;
1587 surfIndexArray[i] = 0;
1588 i++;
1589 if (i >= numSurfaces)
1590 break;
1591 surfIndexData = surfIndex[i].get_data();
1592 }
1593
1594 if (i >= numSurfaces)
1595 {
1596 m_args[index].unitKind = ARG_KIND_SURFACE_2D;
1597 value = surfaces;
1598 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1599 m_args[index].unitSize = (uint16_t)size;
1600 goto finish;
1601 }
1602 CmSurface* surface = nullptr;
1603 m_surfaceMgr->GetSurface(surfIndexData, surface);
1604 if (nullptr == surface)
1605 {
1606 CM_ASSERTMESSAGE("Error: Invalid surface.");
1607 return CM_FAILURE;
1608 }
1609
1610 if (SurfTypeToArgKind(surface->Type()) != m_args[index].unitKind)
1611 { // if surface type changes i.e 2D <-> 2DUP Need to set bIsDrity as true
1612 m_args[index].isDirty = true;
1613 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1614 }
1615
1616 uint32_t cisaMajorVersion, cisaMinorVersion;
1617 m_program->GetCISAVersion(cisaMajorVersion, cisaMinorVersion);
1618
1619 //This path is for surface array, including 1D, 2D, 3D,samplersurface, samplersurface8x8
1620 if ((numSurfaces > 1) && (surface->Type() != CM_ENUM_CLASS_TYPE_CMSURFACEVME))
1621 {
1622 int32_t hr = SetArgsInternalSurfArray(i,index, numSurfaces, surface, surfIndexData, surfIndex,surfaces, surfIndexArray);
1623 if (hr != CM_SUCCESS)
1624 {
1625 CM_ASSERTMESSAGE("Error: SetArgsInternal for surface array failed!\n");
1626 return CM_INVALID_ARG_VALUE;
1627 }
1628 value = surfaces;
1629 surfIndexValue = surfIndexArray;
1630 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1631 m_args[index].unitSize = (uint16_t)size;
1632 }
1633 else
1634 { //This is for single surface and surface array for VME surface
1635 switch (surface->Type())
1636 {
1637 case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
1638 {
1639 CmSurface2DRT* surf2D = static_cast<CmSurface2DRT*>(surface);
1640
1641 uint32_t numAliases = 0;
1642 surf2D->GetNumAliases(numAliases);
1643 if (numAliases)
1644 {
1645 m_args[index].aliasCreated = true;
1646 }
1647 else
1648 {
1649 m_args[index].aliasCreated = false;
1650 }
1651
1652 //set memory object control
1653 surf2D->GetIndexFor2D(surfRegTableIndex);
1654
1655 surfaces[i] = surfRegTableIndex;
1656 surfIndexArray[i] = (uint16_t)surfIndexData;
1657
1658 value = surfaces;
1659 surfIndexValue = surfIndexArray;
1660
1661 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1662 m_args[index].unitSize = (uint16_t)size;
1663
1664 if ((m_args[index].unitKind == ARG_KIND_SURFACE) || (m_args[index].unitKind == ARG_KIND_SURFACE_2D_UP)) // first time or last time is set to 2DUP
1665 {
1666 m_args[index].unitKind = ARG_KIND_SURFACE_2D;
1667 if (m_args[index].surfaceKind == SAMPLER_SURF)
1668 m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER;
1669 }
1670 else if (m_args[index].unitKind != ARG_KIND_SURFACE_2D &&
1671 m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER &&
1672 m_args[index].unitKind != ARG_KIND_SURFACE2DUP_SAMPLER &&
1673 m_args[index].unitKind != ARG_KIND_SURFACE_2D_SCOREBOARD)
1674 {
1675 CM_ASSERTMESSAGE("Error: Assign a 2D surface to the arg which is previously assigned 1D surface, 3D surface, or VME surface.");
1676 return CM_INVALID_ARG_VALUE;
1677 }
1678 break;
1679 }
1680 case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
1681 {
1682 CmBuffer_RT* surf1D = static_cast<CmBuffer_RT*>(surface);
1683
1684 uint32_t numAliases = 0;
1685 surf1D->GetNumAliases(numAliases);
1686 if (numAliases)
1687 {
1688 m_args[index].aliasCreated = true;
1689 }
1690 else
1691 {
1692 m_args[index].aliasCreated = false;
1693 }
1694
1695 //set memory object control
1696 surf1D->GetHandle(handle);
1697
1698 surfaces[i] = handle;
1699 surfIndexArray[i] = (uint16_t)surfIndexData;
1700
1701 value = surfaces;
1702 surfIndexValue = surfIndexArray;
1703
1704 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1705 m_args[index].unitSize = (uint16_t)size;
1706
1707 if (m_args[index].unitKind == ARG_KIND_SURFACE)
1708 {
1709 m_args[index].unitKind = ARG_KIND_SURFACE_1D;
1710 }
1711 else if (m_args[index].unitKind != ARG_KIND_SURFACE_1D)
1712 {
1713 CM_ASSERTMESSAGE("Error: Assign a 1D surface to the arg which is previously assigned 2D surface, 3D surface, or VME surface.");
1714 return CM_INVALID_ARG_VALUE;
1715 }
1716 break;
1717 }
1718 case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
1719 {
1720 CmSurface2DUPRT* surf2DUP = static_cast<CmSurface2DUPRT*>(surface);
1721
1722 //set memory object
1723 surf2DUP->GetHandle(handle);
1724
1725 surfaces[i] = handle;
1726 surfIndexArray[i] = (uint16_t)surfIndexData;
1727
1728 value = surfaces;
1729 surfIndexValue = surfIndexArray;
1730
1731 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1732 m_args[index].unitSize = (uint16_t)size;
1733
1734 if ((m_args[index].unitKind == ARG_KIND_SURFACE) || (m_args[index].unitKind == ARG_KIND_SURFACE_2D)) // first time or last time is set to 2D
1735 {
1736 m_args[index].unitKind = ARG_KIND_SURFACE_2D_UP;
1737 }
1738 else if (m_args[index].unitKind != ARG_KIND_SURFACE_2D_UP)
1739 {
1740 CM_ASSERTMESSAGE("Error: Assign a 2D surface UP to the arg which is previously assigned other surfaces.");
1741 return CM_INVALID_ARG_VALUE;
1742 }
1743
1744 break;
1745 }
1746 case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
1747 {
1748 CmSurface3DRT* surf3D = static_cast<CmSurface3DRT*>(surface);
1749
1750 surf3D->GetHandle(handle);
1751
1752 surfaces[i] = handle;
1753 surfIndexArray[i] = (uint16_t)surfIndexData;
1754
1755 value = surfaces;
1756 surfIndexValue = surfIndexArray;
1757
1758 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1759 m_args[index].unitSize = (uint16_t)size;
1760
1761 if (m_args[index].unitKind == ARG_KIND_SURFACE) // first time
1762 {
1763 m_args[index].unitKind = ARG_KIND_SURFACE_3D;
1764 }
1765 else if (m_args[index].unitKind != ARG_KIND_SURFACE_3D)
1766 {
1767 CM_ASSERTMESSAGE("Error: Assign a 3D surface to the arg which is previously assigned 1D surface, 2D surface or VME surface");
1768 return CM_INVALID_ARG_VALUE;
1769 }
1770 break;
1771 }
1772
1773 case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER:
1774 {
1775 CmStateBuffer* stateBuffer = static_cast< CmStateBuffer* >( surface );
1776 stateBuffer->GetHandle( handle );
1777
1778 surfaces[ i ] = handle;
1779 surfIndexArray[ i ] = ( uint16_t )surfIndexData;
1780
1781 value = surfaces;
1782 surfIndexValue = surfIndexArray;
1783
1784 size = ( size / sizeof( SurfaceIndex ) ) * sizeof( uint32_t );
1785 m_args[ index ].unitSize = ( uint16_t )size;
1786
1787 if ( m_args[ index ].unitKind == ARG_KIND_SURFACE ) // first time
1788 {
1789 m_args[ index ].unitKind = ARG_KIND_STATE_BUFFER;
1790 }
1791 else if ( m_args[ index ].unitKind != ARG_KIND_STATE_BUFFER )
1792 {
1793 CM_ASSERTMESSAGE( "Error: Assign a state buffer to the arg which is previously assigned 1D surface, 2D surface, 3D surface or VME surface" );
1794 return CM_INVALID_ARG_VALUE;
1795 }
1796 break;
1797 }
1798
1799 case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
1800 {
1801 return SetArgsVme(nArgType, index, value, nThreadID);
1802 }
1803 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
1804 {
1805 CmSurfaceSampler8x8* surfSampler8x8 = static_cast <CmSurfaceSampler8x8 *> (surface);
1806 surfSampler8x8->GetIndexCurrent(samplerIndex);
1807 surfSampler8x8->GetCmIndex(samplerCmIndex);
1808 if (samplerCmIndex > surfaceArraySize)
1809 {
1810 m_args[index].aliasIndex = samplerCmIndex;
1811 m_args[index].aliasCreated = true;
1812 samplerCmIndex %= surfaceArraySize;
1813 }
1814
1815 m_surfaceMgr->GetSurface(samplerCmIndex, surface);
1816 if (!surface)
1817 {
1818 CM_ASSERTMESSAGE("Error: Invalid sampler8x8 surface.");
1819 return CM_FAILURE;
1820 }
1821
1822 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1823 m_args[index].unitSize = (uint16_t)size;
1824
1825 value = &samplerIndex;
1826 surfIndexValue = &samplerCmIndex;
1827
1828 if (m_args[index].unitKind == ARG_KIND_SURFACE)
1829 {
1830 if (surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE)
1831 {
1832 m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
1833 m_args[index].nCustomValue = surfSampler8x8->GetAddressControlMode();
1834 }
1835 else
1836 {
1837 m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
1838 }
1839 }
1840 else if (m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER8X8_AVS &&
1841 m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER8X8_VA)
1842 {
1843 CM_ASSERTMESSAGE("Error: Assign a Sampler8x8 surface to the arg which is previously 2D surface.");
1844 return CM_FAILURE;
1845 }
1846 break;
1847 }
1848 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
1849 {
1850 CmSurfaceSampler* surfSampler = static_cast <CmSurfaceSampler *> (surface);
1851 surfSampler->GetHandle(samplerIndex);
1852 surfSampler->GetCmIndexCurrent(samplerCmIndex);
1853
1854 m_surfaceMgr->GetSurface(samplerCmIndex, surface);
1855 if (!surface)
1856 {
1857 CM_ASSERTMESSAGE("Error: Invalid sampler surface.");
1858 return CM_FAILURE;
1859 }
1860
1861 size = (size / sizeof(SurfaceIndex)) * sizeof(uint32_t);
1862 m_args[index].unitSize = (uint16_t)size;
1863
1864 value = &samplerIndex;
1865 surfIndexValue = &samplerCmIndex;
1866
1867 if (m_args[index].unitKind == ARG_KIND_SURFACE)
1868 { // first time
1869 SAMPLER_SURFACE_TYPE type;
1870 surfSampler->GetSurfaceType(type);
1871 if (type == SAMPLER_SURFACE_TYPE_2D)
1872 {
1873 m_args[index].unitKind = ARG_KIND_SURFACE_SAMPLER;
1874 }
1875 else if (type == SAMPLER_SURFACE_TYPE_2DUP)
1876 {
1877 m_args[index].unitKind = ARG_KIND_SURFACE2DUP_SAMPLER;
1878 }
1879 else
1880 {
1881 m_args[index].unitKind = ARG_KIND_SURFACE_3D;
1882 }
1883
1884 }
1885 else if ((m_args[index].unitKind != ARG_KIND_SURFACE_SAMPLER) &&
1886 m_args[index].unitKind != ARG_KIND_SURFACE2DUP_SAMPLER &&
1887 (m_args[index].unitKind != ARG_KIND_SURFACE_3D))
1888 {
1889 CM_ASSERTMESSAGE("Error: Assign a Sampler surface to the arg which is previously 2D/3D surface.");
1890 return CM_FAILURE;
1891 }
1892 break;
1893 }
1894 default:
1895 {
1896 CM_ASSERTMESSAGE("Error: Invalid surface type.");
1897 return CM_INVALID_ARG_VALUE;
1898 }
1899 }
1900 }
1901 }
1902 else if (m_args[index].unitKind == ARG_KIND_SAMPLER)
1903 {
1904 unsigned int numSamplers = m_args[index].unitSize / sizeof(int);
1905
1906 if (numSamplers > 1)
1907 {
1908 size = numSamplers * sizeof(unsigned int);
1909
1910 for (unsigned int i = 0; i < numSamplers; i++)
1911 {
1912 SamplerIndex* samplerIndex = (SamplerIndex*)value + i;
1913 samplerIdx = samplerIndex->get_data();
1914 sampler_index_array.push_back(samplerIdx);
1915 }
1916 }
1917 else
1918 {
1919 SamplerIndex* samplerIndex = (SamplerIndex*)value;
1920 samplerIdx = ((SamplerIndex*)value)->get_data();
1921 size = sizeof(unsigned int);
1922 m_args[index].unitSize = (uint16_t)size;
1923 value = &samplerIdx;
1924 }
1925 }
1926
1927 finish:
1928 if ( nArgType == CM_KERNEL_INTERNEL_ARG_PERKERNEL ) // per kernel arg
1929 {
1930 CM_ARG& arg = m_args[ index ];
1931
1932 // Assume from now on, size is valid, i.e. confirmed with function signature
1933 if( !arg.value )
1934 {
1935 //Increment size kernel arguments will take up in CURBE
1936 uint32_t tempUnitSize = m_args[ index ].unitSize;
1937 if( (m_args[index].unitKind == ARG_KIND_SURFACE_VME ) ||
1938 (m_args[index].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
1939 (m_args[index].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ))
1940 {
1941 tempUnitSize = CM_ARGUMENT_SURFACE_SIZE;
1942 }
1943
1944 // first setKernelArg or first setKernelArg after each enqueue
1945 arg.value = MOS_NewArray(uint8_t,size);
1946 if( !arg.value )
1947 {
1948 CM_ASSERTMESSAGE("Error: Out of system memory.");
1949 return CM_OUT_OF_HOST_MEMORY;
1950 }
1951
1952 arg.unitCount = 1;
1953
1954 CmSafeMemCopy((void *)arg.value, value, size);
1955
1956 if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
1957 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
1958 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
1959 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
1960 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
1961 ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
1962 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
1963 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
1964 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
1965 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
1966 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
1967 ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
1968 {
1969 arg.surfIndex = MOS_NewArray(uint16_t, (size / sizeof(int32_t)));
1970 if (!arg.surfIndex)
1971 {
1972 CM_ASSERTMESSAGE("Error: Out of system memory.");
1973 MosSafeDeleteArray(arg.value);
1974 return CM_OUT_OF_HOST_MEMORY;
1975 }
1976 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(int32_t) * sizeof(uint16_t));
1977 CmSafeMemCopy((void *)arg.surfIndex, surfIndexValue, size / sizeof(int32_t) * sizeof(uint16_t));
1978 }
1979
1980 if (m_args[index].unitKind == ARG_KIND_SAMPLER)
1981 {
1982 for (unsigned int samplerIndex = 0; samplerIndex < sampler_index_array.size(); samplerIndex++)
1983 {
1984 *( (int *)arg.value + samplerIndex) = sampler_index_array[samplerIndex];
1985 }
1986 }
1987
1988 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
1989 arg.isDirty = true;
1990 }
1991 else
1992 {
1993 if( arg.unitCount != 1 )
1994 {
1995 CM_ASSERTMESSAGE("Error: Invalid arg count.");
1996 return CM_FAILURE;
1997 }
1998 if( memcmp( (void *)arg.value, value, size ) != 0 )
1999 {
2000 CmSafeMemCopy((void *)arg.value, value, size);
2001 m_dirty |= CM_KERNEL_DATA_KERNEL_ARG_DIRTY;
2002 arg.isDirty = true;
2003 }
2004 if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2005 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2006 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2007 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2008 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2009 ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2010 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2011 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2012 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2013 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2014 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2015 ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2016 {
2017 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(int32_t) * sizeof(uint16_t));
2018 CmSafeMemCopy((void *)arg.surfIndex, surfIndexValue, size/sizeof(int32_t) * sizeof(uint16_t));
2019 }
2020
2021 if (m_args[index].unitKind == ARG_KIND_SAMPLER)
2022 {
2023 for (unsigned int samplerIndex = 0; samplerIndex < sampler_index_array.size(); samplerIndex++)
2024 {
2025 *((int *)arg.value + samplerIndex) = sampler_index_array[samplerIndex];
2026 }
2027 }
2028 }
2029
2030 m_perKernelArgExists = true;
2031 }
2032 else //per thread arg
2033 {
2034 CM_ARG& arg = m_args[ index ];
2035
2036 // Assume from now on, size is valid, i.e. confirmed with function signature
2037 if( !arg.value )
2038 {
2039 //Increment size per-thread arguments will take up in payload of media object or media object walker commands
2040 m_sizeInPayload += arg.unitSize;
2041 DW_ALIGNMENT(m_sizeInPayload);
2042
2043 // first setThreadArg or first setThreadArg after each enqueue
2044 arg.value = MOS_NewArray(uint8_t, (size * m_threadCount));
2045 if( !arg.value )
2046 {
2047 CM_ASSERTMESSAGE("Error: Out of system memory.");
2048 return CM_OUT_OF_HOST_MEMORY;
2049
2050 }
2051 arg.unitCount = m_threadCount;
2052
2053 uint32_t offset = size * nThreadID;
2054 uint8_t *threadValue = ( uint8_t *)arg.value;
2055 threadValue += offset;
2056
2057 CmSafeMemCopy(threadValue, value, size);
2058 if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2059 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2060 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2061 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2062 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2063 ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2064 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2065 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2066 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2067 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2068 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2069 ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2070 {
2071 arg.surfIndex = MOS_NewArray(uint16_t, (size / sizeof(uint32_t) * m_threadCount));
2072 if( !arg.surfIndex )
2073 {
2074 CM_ASSERTMESSAGE("Error: Out of system memory.");
2075 MosSafeDeleteArray(arg.value);
2076 return CM_OUT_OF_HOST_MEMORY;
2077 }
2078 CmSafeMemSet((void *)arg.surfIndex, 0, size/sizeof(uint32_t) * sizeof(uint16_t) * m_threadCount);
2079 CmSafeMemCopy((void *)(arg.surfIndex + size/sizeof(uint32_t) * nThreadID), surfIndexValue, size/sizeof(uint32_t) * sizeof(uint16_t));
2080 }
2081 m_perThreadArgExists = true;
2082 }
2083 else
2084 {
2085 if( arg.unitCount != m_threadCount )
2086 {
2087 CM_ASSERTMESSAGE("Error: arg count is not matched with thread count.");
2088 return CM_FAILURE;
2089
2090 }
2091 uint32_t offset = size * nThreadID;
2092 uint8_t *threadValue = ( uint8_t *)arg.value;
2093 threadValue += offset;
2094
2095 if( memcmp( threadValue, value, size ) != 0 )
2096 {
2097 CmSafeMemCopy(threadValue, value, size);
2098 m_dirty |= CM_KERNEL_DATA_THREAD_ARG_DIRTY;
2099 arg.isDirty = true;
2100 }
2101 if((( m_args[ index ].unitKind == ARG_KIND_SURFACE ) || // first time
2102 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_1D ) ||
2103 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D ) ||
2104 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
2105 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
2106 ( m_args[ index ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
2107 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_3D ) ||
2108 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_VME ) ||
2109 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
2110 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
2111 ( m_args[ index ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
2112 ( m_args[ index ].unitKind == ARG_KIND_STATE_BUFFER ) ) && surfIndexValue )
2113 {
2114 CmSafeMemCopy((void *)(arg.surfIndex + size/sizeof(uint32_t) * nThreadID), surfIndexValue, size/sizeof(uint32_t) * sizeof(uint16_t));
2115 }
2116 }
2117 }
2118
2119 m_args[index].isSet = true;
2120
2121 return CM_SUCCESS;
2122 }
2123
2124 //*-----------------------------------------------------------------------------
2125 //! Set per kernel arguments. The total size of all per kernel arguments and per thread
2126 //! arguments should be less than or equal to 256 Bytes (CM_MAX_ARG_SIZE_IN_BYTE).
2127 //! The life time of all per kernel arguments and per thread lasts untill the next enqueue
2128 //! i.e. after enqueue, ALL arguments need to be reset.
2129 //! INPUT:
2130 //! 1) Index of argument in CM kernel function (genx_main). The index is
2131 //! global for per kernel arguments and per thread arguments.
2132 //! 2) Size of the argument.
2133 //! 3) Pointer to argument value.
2134 //! OUTPUT:
2135 //! CM_SUCCESS or
2136 //! CM_INVALID_ARG_INDEX if index is invalid;
2137 //! CM_INVALID_ARG_SIZE if size is invalid;
2138 //! CM_INVALID_ARG_VALUE if value is NULL.
2139 //*-----------------------------------------------------------------------------
SetKernelArg(uint32_t index,size_t size,const void * value)2140 CM_RT_API int32_t CmKernelRT::SetKernelArg(uint32_t index, size_t size, const void * value )
2141 {
2142 INSERT_API_CALL_LOG(GetHalState());
2143 //It should be mutual exclusive with Indirect Data
2144 if(m_kernelPayloadData)
2145 {
2146 CM_ASSERTMESSAGE("Error: SetKernelArg should be mutual exclusive with indirect data.");
2147 return CM_KERNELPAYLOAD_PERKERNELARG_MUTEX_FAIL;
2148 }
2149
2150 if( index >= m_argCount )
2151 {
2152 CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
2153 return CM_INVALID_ARG_INDEX;
2154
2155 }
2156
2157 if( !value)
2158 {
2159 CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
2160 return CM_INVALID_ARG_VALUE;
2161 }
2162
2163 if( size == 0)
2164 {
2165 CM_ASSERTMESSAGE("Error: Invalid kernel arg size.");
2166 return CM_INVALID_ARG_SIZE;
2167 }
2168
2169 int32_t nRetVal = 0;
2170 if ( ( nRetVal = SetArgsInternal( CM_KERNEL_INTERNEL_ARG_PERKERNEL, index, size, value ) ) != CM_SUCCESS )
2171 {
2172 return nRetVal;
2173 }
2174
2175 return CM_SUCCESS;
2176 }
2177
SetKernelArgPointer(uint32_t index,size_t size,const void * value)2178 CM_RT_API int32_t CmKernelRT::SetKernelArgPointer(uint32_t index, size_t size, const void *value)
2179 {
2180 INSERT_API_CALL_LOG(GetHalState());
2181
2182 //It should be mutual exclusive with Indirect Data
2183 if (m_kernelPayloadData)
2184 {
2185 CM_ASSERTMESSAGE("Error: SetKernelArg should be mutual exclusive with indirect data.");
2186 return CM_KERNELPAYLOAD_PERKERNELARG_MUTEX_FAIL;
2187 }
2188
2189 if (index >= m_argCount)
2190 {
2191 CM_ASSERTMESSAGE("Error: Invalid kernel arg count.");
2192 return CM_INVALID_ARG_INDEX;
2193 }
2194
2195 if (!value)
2196 {
2197 CM_ASSERTMESSAGE("Error: Invalid kernel arg value.");
2198 return CM_INVALID_ARG_VALUE;
2199 }
2200
2201 uint64_t *argValue = MOS_NewArray(uint64_t, 1);
2202 if (!argValue)
2203 {
2204 CM_ASSERTMESSAGE("Error: Out of system memory.");
2205 return CM_OUT_OF_HOST_MEMORY;
2206 }
2207 CmSafeMemSet(argValue, 0, sizeof(uint64_t));
2208 CmSafeMemCopy(argValue, value, size);
2209
2210 // Get the gfx start address of SVM/stateless buffer.
2211 uint64_t gfxAddress = *(argValue);
2212 MosSafeDeleteArray(argValue);
2213
2214 // Check the gfx start address is valid or not
2215 std::set<CmSurface *> statelessSurfArray = m_surfaceMgr->GetStatelessSurfaceArray();
2216 bool valid = false;
2217 for(auto surface : statelessSurfArray)
2218 {
2219 CmBuffer_RT *buffer = static_cast<CmBuffer_RT *>(surface);
2220 uint64_t startAddress = 0;
2221 buffer->GetGfxAddress(startAddress);
2222 size_t size = buffer->GetSize();
2223
2224 if (gfxAddress >= startAddress
2225 && gfxAddress < (startAddress + size))
2226 {
2227 SurfaceIndex *surfIndex = nullptr;
2228 buffer->GetIndex(surfIndex);
2229 uint32_t surfIndexData = surfIndex->get_data();
2230 m_surfaceArray[surfIndexData] = true;
2231
2232 m_args[index].isStatelessBuffer = true;
2233 m_args[index].index = (uint16_t)surfIndexData;
2234
2235 valid = true;
2236 break;
2237 }
2238 }
2239 if (!valid)
2240 {
2241 CM_ASSERTMESSAGE("Error: the kernel arg pointer is not valid.");
2242 return CM_INVALID_KERNEL_ARG_POINTER;
2243 }
2244
2245 int32_t nRetVal = SetArgsInternal(CM_KERNEL_INTERNEL_ARG_PERKERNEL,
2246 index,
2247 size,
2248 value);
2249 if (nRetVal != CM_SUCCESS)
2250 {
2251 return nRetVal;
2252 }
2253
2254 return CM_SUCCESS;
2255 }
2256
2257 //*-----------------------------------------------------------------------------
2258 //| Purpose: Set Static Buffer
2259 //| Return : The result of operation
2260 //*-----------------------------------------------------------------------------
SetStaticBuffer(uint32_t index,const void * value)2261 CM_RT_API int32_t CmKernelRT::SetStaticBuffer(uint32_t index, const void * value )
2262 {
2263 INSERT_API_CALL_LOG(GetHalState());
2264 if(index >= CM_GLOBAL_SURFACE_NUMBER)
2265 {
2266 CM_ASSERTMESSAGE("Error: Surface Index exceeds max global surface number.");
2267 return CM_INVALID_GLOBAL_BUFFER_INDEX;
2268 }
2269
2270 if(!value)
2271 {
2272 CM_ASSERTMESSAGE("Error: Invalid StaticBuffer arg value.");
2273 return CM_INVALID_BUFFER_HANDLER;
2274 }
2275
2276 SurfaceIndex* surfIndex = (SurfaceIndex* )value;
2277 uint32_t surfIndexData = surfIndex->get_data();
2278 if (surfIndexData >= m_surfaceMgr->GetSurfacePoolSize())
2279 {
2280 CM_ASSERTMESSAGE("Error: StaticBuffer doesn't allow alias index.");
2281 return CM_INVALID_ARG_INDEX;
2282 }
2283
2284 CmSurface* surface = nullptr;
2285 m_surfaceMgr->GetSurface( surfIndexData, surface );
2286 if(surface == nullptr)
2287 {
2288 CM_ASSERTMESSAGE("Error: Invalid surface.");
2289 return CM_INVALID_BUFFER_HANDLER;
2290 }
2291
2292 CmBuffer_RT* surf1D = nullptr;
2293 if ( surface->Type() == CM_ENUM_CLASS_TYPE_CMBUFFER_RT )
2294 {
2295 uint32_t handle = 0; // for 1D surf
2296
2297 surf1D = static_cast< CmBuffer_RT* >( surface );
2298 surf1D->GetHandle( handle );
2299
2300 if (m_globalSurfaces[index] == nullptr)
2301 {
2302 m_globalSurfaces[index] = MOS_New(SurfaceIndex,0);
2303 if( !m_globalSurfaces[index] )
2304 {
2305 CM_ASSERTMESSAGE("Error: Out of system memory.");
2306 return CM_OUT_OF_HOST_MEMORY;
2307 }
2308 }
2309 *m_globalSurfaces[index] = handle;
2310 m_globalCmIndex[index] = surfIndexData;
2311 m_dirty |= CM_KERNEL_DATA_GLOBAL_SURFACE_DIRTY;
2312 }
2313 else
2314 {
2315 CM_ASSERTMESSAGE("Error: StaticBuffer only supports CmBuffer type.");
2316 return CM_INVALID_BUFFER_HANDLER;
2317 }
2318 return CM_SUCCESS;
2319 }
2320
2321 //*-----------------------------------------------------------------------------
2322 //! Set per thread arguments. The total size of all per kernel arguments and per thread
2323 //! arguments should be less than or equal to 256 Bytes
2324 //! The life time of all per kernel arguments and per thread lasts untill the next enqueue
2325 //! i.e. after enqueue, ALL arguments need to be reset.
2326 //! INPUT:
2327 //! 1) Thread index.
2328 //! 2) Index of argument in CM kernel function (genx_main). The index is
2329 //! global for per kernel arguments and per thread arguments.
2330 //! 3) Size of the argument.
2331 //! 4) Pointer to argument .
2332 //! OUTPUT:
2333 //! CM_SUCCESS or
2334 //! CM_INVALID_ARG_INDEX if index is invalid
2335 //! CM_INVALID_ARG_SIZE if size is invalid
2336 //! CM_INVALID_ARG_VALUE if value is nullptr
2337 //*-----------------------------------------------------------------------------
SetThreadArg(uint32_t threadId,uint32_t index,size_t size,const void * value)2338 CM_RT_API int32_t CmKernelRT::SetThreadArg(uint32_t threadId, uint32_t index, size_t size, const void * value )
2339 {
2340 INSERT_API_CALL_LOG(GetHalState());
2341
2342 //It should be mutual exclusive with Indirect Data
2343 if(m_kernelPayloadData)
2344 {
2345 CM_ASSERTMESSAGE("Error: SetThredArg should be mutual exclusive with indirect data.");
2346 return CM_KERNELPAYLOAD_PERTHREADARG_MUTEX_FAIL;
2347 }
2348
2349 if(m_threadCount > m_halMaxValues->maxUserThreadsPerTask || m_threadCount <=0)
2350 {
2351 CM_ASSERTMESSAGE("Error: Minimum or Maximum number of threads exceeded.");
2352 return CM_FAILURE;
2353 }
2354
2355 if( index >= m_argCount )
2356 {
2357 CM_ASSERTMESSAGE("Error: Invalid thread arg count.");
2358 return CM_INVALID_ARG_INDEX;
2359
2360 }
2361
2362 if( threadId >= m_threadCount )
2363 {
2364 CM_ASSERTMESSAGE("Error: thread id exceeds the threadcount.");
2365 return CM_INVALID_THREAD_INDEX;
2366
2367 }
2368
2369 if( !value)
2370 {
2371 CM_ASSERTMESSAGE("Error: Invalid thread arg value.");
2372 return CM_INVALID_ARG_VALUE;
2373 }
2374
2375 if( size == 0)
2376 {
2377 CM_ASSERTMESSAGE("Error: Invalid thread arg size.");
2378 return CM_INVALID_ARG_SIZE;
2379 }
2380
2381 int32_t nRetVal = 0;
2382 if ( ( nRetVal = SetArgsInternal( CM_KERNEL_INTERNEL_ARG_PERTHREAD, index, size, value, threadId ) ) != CM_SUCCESS )
2383 {
2384 return nRetVal;
2385 }
2386
2387 return CM_SUCCESS;
2388 }
2389
2390 //*-----------------------------------------------------------------------------
2391 //| Purpose: Calculate the total size of kernel data
2392 //*-----------------------------------------------------------------------------
CalcKernelDataSize(uint32_t movInstNum,uint32_t numArgs,uint32_t argSize,uint32_t & totalKernelDataSize)2393 int32_t CmKernelRT::CalcKernelDataSize(
2394 uint32_t movInstNum, // [in] the number of move instructions
2395 uint32_t numArgs, // [in] number of args , surface array count
2396 uint32_t argSize, // [in] Size of arguments
2397 uint32_t & totalKernelDataSize) // [out] total size of kernel data
2398 {
2399 int32_t hr = CM_SUCCESS;
2400
2401 uint32_t headSize = ( KERNEL_INFO_SIZE_IN_DWORD + numArgs * PER_ARG_SIZE_IN_DWORD ) * sizeof( uint32_t );
2402 uint32_t totalSize = headSize + movInstNum * CM_MOVE_INSTRUCTION_SIZE + m_binarySize + argSize;
2403
2404 totalSize += 4; // one dword for flag. the first bit is curbe on/off
2405 totalSize += 8; //sizeof( uint64_t ) for id
2406
2407 totalSize += 16; // static buffer indices
2408 totalSize += 12; // GT Pin buffer indices
2409
2410 ////////////////////////////////////////////////////////////////////////////
2411 // Calculate indirect data size (start)
2412 ////////////////////////////////////////////////////////////////////////////
2413 // Memory layout for indirect data:
2414 // Indirect Data Size -------------------- 2 bytes (must present)
2415 // Below area is present only if above value is not ZERO
2416 // Indirect Data Buffer ------------------ Size indicated above
2417 totalSize += sizeof(uint16_t); //field for indirect data size
2418 if(m_usKernelPayloadDataSize)
2419 {
2420 totalSize += m_usKernelPayloadDataSize;
2421 }
2422 // Memory layout for indirect surface:
2423 // Indirect Surface Count ----------------- 2 bytes (must present)
2424 // Below are present only if the above value is not ZERO
2425 // Kind of Indirect Surface 0 ------------- 2 Bytes
2426 // Handle of Indirect Surface 0 ----------- 2 Bytes
2427 // Surface Index of Indirect Surface 0 ---- 2 Bytes
2428 // ..........
2429 // Kind of Indirect Surface n-1 ----------- 2 Bytes
2430 // Handle of Indirect Surface n-1---------- 2 Bytes
2431 // Surface Index of Indirect Surface n-1 -- 2 Bytes
2432 totalSize += sizeof(uint16_t); //field for indirect surface count
2433 if(m_usKernelPayloadSurfaceCount)
2434 {
2435 totalSize += m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO);
2436 }
2437
2438 totalKernelDataSize = totalSize;
2439
2440 return hr;
2441 }
2442
2443 //*-----------------------------------------------------------------------------
2444 //| Purpose: Create mov instructions
2445 //| instructions will be copied into DstMem
2446 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateMovInstructions( uint32_t &movInstNum, uint8_t *&codeDst, CM_ARG* tempArgs, uint32_t numArgs)
{
    // Emits the mov instructions that relocate CURBE-delivered argument data to
    // the payload offsets the compiled kernel expects, plus (optionally) one
    // add instruction that adjusts the scoreboard Y coordinate.
    //   movInstNum [out]: number of instructions written to codeDst
    //   codeDst    [out]: newly allocated instruction buffer (caller frees);
    //                     only allocated when at least one instruction is needed
    //   tempArgs   [in] : flattened argument descriptors (offsets are updated
    //                     in place during re-packing)
    //   numArgs    [in] : element count of tempArgs

    //Create Mov Instruction
    CmDynamicArray movInsts( numArgs );
    uint32_t renderGen = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState->platform.eRenderCoreFamily;
    // CM_NOT_IMPLEMENTED is tolerated: on some platforms the constructor needs
    // no distance configuration.
    CM_RETURN_CODE ret = m_movInstConstructor->SetInstDistanceConfig(movInsts.GetMaxSize(), renderGen);
    if (ret != CM_SUCCESS && ret != CM_NOT_IMPLEMENTED)
    {
        return ret;
    }

    movInstNum = 0;

    //Note: if no thread arg and no per kernel arg, no need move instructions at all.
    if( m_curbeEnabled && (m_perThreadArgExists || m_perKernelArgExists))
    {
        if( ( m_argCount > 0 ) && ( m_threadCount > 1) )
        {
            PCM_ARG* sortedArgs = MOS_NewArray(PCM_ARG,numArgs);
            if( !sortedArgs )
            {
                CM_ASSERTMESSAGE("Error: Out of system memory.");
                return CM_OUT_OF_HOST_MEMORY;
            }
            for( uint32_t j = 0; j < numArgs; j++ )
            {
                sortedArgs[ j ] = tempArgs + j;
            }
            // sort arg to sortedArgs according to offsetinPayload
            QuickSort( sortedArgs, 0, numArgs - 1 );

            // record compiler generated offset, used as move dst later
            uint16_t *unitOffsetInPayloadSorted = MOS_NewArray(uint16_t, numArgs);
            if( !unitOffsetInPayloadSorted )
            {
                CM_ASSERTMESSAGE("Error: Out of system memory.");
                MosSafeDeleteArray(sortedArgs);
                return CM_OUT_OF_HOST_MEMORY;
            }
            for( uint32_t j = 0; j < numArgs; j++ )
            {
                unitOffsetInPayloadSorted[j] = sortedArgs[j]->unitOffsetInPayload;
            }

            // Pass 1: re-pack per-kernel args (unitCount == 1) contiguously
            // after R0; the payload starts at offset 32.
            uint16_t kernelArgEnd = 32;
            bool beforeFirstThreadArg = true;
            for( uint32_t j = 0; j < numArgs; j++ )
            {
                if( sortedArgs[j]->unitCount == 1 )
                    // consider m_threadCount = 1 case later, where all args are treated as per thread arg
                {
                    if( beforeFirstThreadArg )
                    {
                        kernelArgEnd = sortedArgs[j]->unitOffsetInPayload + sortedArgs[j]->unitSize;
                    }
                    else
                    {
                        DW_ALIGNMENT( kernelArgEnd ); // necessary ?
                        sortedArgs[j]->unitOffsetInPayload = kernelArgEnd;
                        kernelArgEnd += sortedArgs[j]->unitSize;
                    }
                }
                else // per thread
                {
                    if( beforeFirstThreadArg )
                    {
                        beforeFirstThreadArg = false;
                    }
                }
            }

            GRF_ALIGNMENT(kernelArgEnd); // offset of thread arg start related to R0
            uint32_t threadArgStart = kernelArgEnd;

            // Pass 2: place per-thread args (unitCount > 1) after the
            // per-kernel region, dword-aligned.
            for (uint32_t j = 0; j < numArgs; j++)
            {
                if (sortedArgs[j]->unitCount > 1) // per thread
                {
                    sortedArgs[j]->unitOffsetInPayload = (uint16_t)threadArgStart;
                    threadArgStart += sortedArgs[j]->unitSize;
                    DW_ALIGNMENT(threadArgStart);
                }
            }

            // Moves are only required when re-packing actually changed at
            // least one offset.
            bool needMovInstructions = false;
            for( uint32_t j = 0; j < numArgs; j++ )
            {
                if ( unitOffsetInPayloadSorted[j] != sortedArgs[j]->unitOffsetInPayload )
                {
                    needMovInstructions = true;
                    break;
                }
            }

            if (needMovInstructions)
            {
                // Add move
                GRF_ALIGNMENT(threadArgStart);
                uint32_t threadArgEnd = threadArgStart;
                uint32_t size = threadArgEnd - 32;
                CM_ASSERT((size % 32) == 0);

                // move all arguments starting from R1 (32 ) through threadArgEnd to R64 (R0 reserved for media dispatch)
                uint32_t nextIndex = 0;
                nextIndex += m_movInstConstructor->ConstructObjMovs(R64_OFFSET, 32, size, movInsts, nextIndex, true, m_blhwDebugEnable);

                // Then move each argument back from its staging spot in R64+
                // to the compiler-expected offset recorded earlier.
                beforeFirstThreadArg = true;
                for (uint32_t j = 0; j < numArgs; j++)
                {
                    if (sortedArgs[j]->unitCount == 1)
                        // consider m_threadCount = 1 case later, where all args are treated as per thread arg
                    {
                        if (beforeFirstThreadArg == false)
                        {
                            // add move inst to move from sortedArgs[j]->unitOffsetInPayload + R64 to unitOffsetInPayloadSorted[j]
                            nextIndex += m_movInstConstructor->ConstructObjMovs(unitOffsetInPayloadSorted[j],
                                R64_OFFSET + sortedArgs[j]->unitOffsetInPayload - 32,
                                sortedArgs[j]->unitSize, movInsts, nextIndex, true, m_blhwDebugEnable);
                        }
                    }
                    else // per thread
                    {
                        if (beforeFirstThreadArg)
                        {
                            beforeFirstThreadArg = false;
                        }

                        // add move inst to move from sortedArgs[j]->unitOffsetInPayload + R64 to unitOffsetInPayloadSorted[j]
                        nextIndex += m_movInstConstructor->ConstructObjMovs(unitOffsetInPayloadSorted[j],
                            R64_OFFSET + sortedArgs[j]->unitOffsetInPayload - CM_PAYLOAD_OFFSET,
                            sortedArgs[j]->unitSize, movInsts, nextIndex, true, m_blhwDebugEnable);
                    }
                }

                movInstNum = nextIndex;
            }

            MosSafeDeleteArray(sortedArgs);
            MosSafeDeleteArray(unitOffsetInPayloadSorted);
        }
    }// End of if( m_curbeEnabled && m_ThreadArgExists)

    // Optional scoreboard-Y adjustment: one extra add instruction appended
    // after the moves when a thread space requested it.
    uint32_t addInstDW[4];
    MOS_ZeroMemory(addInstDW, CM_MOVE_INSTRUCTION_SIZE);
    uint32_t addInstNum =0;

    if(m_threadSpace && m_adjustScoreboardY)
    {
        addInstNum = 1;

        addInstDW[0] = CM_BDW_ADJUST_Y_SCOREBOARD_DW0;
        addInstDW[1] = CM_BDW_ADJUST_Y_SCOREBOARD_DW1;
        addInstDW[2] = CM_BDW_ADJUST_Y_SCOREBOARD_DW2;

        // constant word needs high 16 bits to be same as low 16 bits
        uint16_t tmp = - (int32_t)(m_adjustScoreboardY);
        addInstDW[3] = (tmp << 16) + tmp;

    }

    if (movInstNum || addInstNum)
    {
        codeDst = MOS_NewArray(uint8_t, ((movInstNum + addInstNum) * CM_MOVE_INSTRUCTION_SIZE));
        if (!codeDst)
        {
            return CM_OUT_OF_HOST_MEMORY;
        }
    }

    // Serialize the mov instructions; debug info is kept only on the first.
    for( uint32_t j = 0; j < movInstNum; j ++ )
    {
        MovInst_RT* movInst = (MovInst_RT*)movInsts.GetElement( j );
        if (!movInst)
        {
            CM_ASSERTMESSAGE("Error: Invalid move instructions.");
            MosSafeDeleteArray(codeDst);
            return CM_FAILURE;
        }
        if (j != 0)
        {
            movInst->ClearDebug();
        }
        CmSafeMemCopy(codeDst + j * CM_MOVE_INSTRUCTION_SIZE, movInst->GetBinary(), CM_MOVE_INSTRUCTION_SIZE);
        CmSafeDelete(movInst); // delete each element in movInsts
    }
    movInsts.Delete();

    if(addInstNum != 0)
    {
        CmSafeMemCopy(codeDst + movInstNum * CM_MOVE_INSTRUCTION_SIZE, addInstDW, CM_MOVE_INSTRUCTION_SIZE);

        movInstNum += addInstNum; // take add Y instruction into consideration
    }

    return CM_SUCCESS;
}
2643
CreateKernelArgDataGroup(uint8_t * & data,uint32_t value)2644 int32_t CmKernelRT::CreateKernelArgDataGroup(
2645 uint8_t *&data,
2646 uint32_t value)
2647 {
2648 if (data == nullptr)
2649 {
2650 data = MOS_NewArray(uint8_t, sizeof(uint32_t));
2651 if(!data)
2652 {
2653 return CM_OUT_OF_HOST_MEMORY;
2654 }
2655 }
2656 *(uint32_t *)data = value;
2657 return CM_SUCCESS;
2658 }
2659
CreateKernelImplicitArgDataGroup(uint8_t * & data,uint32_t size)2660 int32_t CmKernelRT::CreateKernelImplicitArgDataGroup(
2661 uint8_t *&data,
2662 uint32_t size)
2663 {
2664 data = MOS_NewArray(uint8_t, (size * sizeof(uint32_t)));
2665 if (!data)
2666 {
2667 return CM_OUT_OF_HOST_MEMORY;
2668 }
2669 *(uint32_t *)data = 0;
2670 return CM_SUCCESS;
2671 }
2672
2673 //*-----------------------------------------------------------------------------
//| Purpose:    Create thread argument data for one kernel argument
//|             values are copied into kernelArg->firstValue
2676 //*-----------------------------------------------------------------------------
int32_t CmKernelRT::CreateThreadArgData(
    PCM_HAL_KERNEL_ARG_PARAM kernelArg,
    uint32_t threadArgIndex,
    CmThreadSpaceRT* threadSpace,
    CM_ARG* cmArgs )
{
    // Copies the argument values recorded in cmArgs[threadArgIndex] into
    // kernelArg->firstValue, reordering per-thread values to the scoreboard
    // dispatch sequence when the thread space carries a dependency pattern.
    //   kernelArg      [in/out]: HAL-side arg; firstValue is (re)allocated here
    //   threadArgIndex [in]    : index of the source arg in cmArgs
    //   threadSpace    [in]    : optional thread space providing board order
    //   cmArgs         [in]    : runtime-side argument array
    int32_t hr = CM_SUCCESS;
    uint32_t threadArgCount = cmArgs[ threadArgIndex].unitCount;
    uint32_t threadArgSize = cmArgs[ threadArgIndex ].unitSize;

    // NOTE(review): this tests cmArgs->unitKind, i.e. element 0 of the array,
    // not cmArgs[threadArgIndex].unitKind -- confirm this is intentional.
    if (CHECK_SURFACE_TYPE(cmArgs->unitKind, ARG_KIND_SURFACE_VME))
    {
        // reallocate the memory since the number of surfaces in a vme surface could vary
        MosSafeDeleteArray(kernelArg->firstValue);
    }

    if( kernelArg->firstValue == nullptr)
    {
        // if firstValue = nullptr, then create a new one, otherwise, update the existing one
        kernelArg->firstValue = MOS_NewArray(uint8_t, (cmArgs[threadArgIndex].unitCount * cmArgs[threadArgIndex].unitSize));
        if( !kernelArg->firstValue )
        {
            hr = CM_OUT_OF_HOST_MEMORY;
            goto finish;
        }
    }

    if(kernelArg->unitCount == 1 ) // kernel arg
    {
        // Per-kernel argument: single straight copy, no reordering possible.
        if (cmArgs[threadArgIndex].value)
        {
            CmSafeMemCopy(kernelArg->firstValue, cmArgs[threadArgIndex].value, threadArgCount * threadArgSize);
        }
        goto finish;
    }

    if( threadSpace != nullptr )
    {
        CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
        threadSpace->GetDependencyPatternType(dependencyPatternType);

        if ((m_threadSpaceAssociated == true) && (dependencyPatternType != CM_NONE_DEPENDENCY))
        {
            // Dependency pattern active: permute so that entry i of firstValue
            // holds the value for the thread dispatched i-th (board order).
            CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
            threadSpace->GetThreadSpaceUnit(threadSpaceUnit);

            uint32_t *boardOrder = nullptr;
            threadSpace->GetBoardOrder(boardOrder);

            for (uint32_t index = 0; index < threadArgCount; index++)
            {
                // Source slot is the original thread id of the index-th
                // dispatched thread.
                uint32_t offset = threadSpaceUnit[boardOrder[index]].threadId;
                uint8_t *argSrc = (uint8_t*)cmArgs[threadArgIndex].value + offset * threadArgSize;
                uint8_t *argDst = kernelArg->firstValue + index * threadArgSize;
                CmSafeMemCopy(argDst, argSrc, threadArgSize);
            }
        }
        else
        {
            // No dependency-driven reordering: copy in thread-id order.
            CmSafeMemCopy(kernelArg->firstValue, cmArgs[ threadArgIndex ].value, threadArgCount * threadArgSize);
        }
    }
    else
    {
        // No thread space at all: copy in thread-id order.
        CmSafeMemCopy(kernelArg->firstValue, cmArgs[ threadArgIndex ].value, threadArgCount * threadArgSize);
    }

finish:
    return hr;
}
2747
2748 //*-----------------------------------------------------------------------------
//| Purpose:    Sort thread space for scoreboarding
2750 //*-----------------------------------------------------------------------------
SortThreadSpace(CmThreadSpaceRT * threadSpace)2751 int32_t CmKernelRT::SortThreadSpace( CmThreadSpaceRT* threadSpace )
2752 {
2753 int32_t hr = CM_SUCCESS;
2754 CM_DEPENDENCY_PATTERN dependencyPatternType = CM_NONE_DEPENDENCY;
2755
2756 CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpace);
2757
2758 threadSpace->GetDependencyPatternType(dependencyPatternType);
2759
2760 if(!threadSpace->IsThreadAssociated())
2761 {//Skip Sort if it is media walker
2762 return CM_SUCCESS;
2763 }
2764
2765 if (threadSpace->CheckDependencyVectorsSet())
2766 {
2767 threadSpace->WavefrontDependencyVectors();
2768 }
2769 else
2770 {
2771 switch (dependencyPatternType)
2772 {
2773 case CM_WAVEFRONT:
2774 threadSpace->Wavefront45Sequence();
2775 break;
2776
2777 case CM_WAVEFRONT26:
2778 threadSpace->Wavefront26Sequence();
2779 break;
2780
2781 case CM_WAVEFRONT26Z:
2782 threadSpace->Wavefront26ZSequence();
2783 break;
2784
2785 case CM_WAVEFRONT26ZI:
2786 CM_26ZI_DISPATCH_PATTERN dispatchPattern;
2787 threadSpace->Get26ZIDispatchPattern(dispatchPattern);
2788 switch (dispatchPattern)
2789 {
2790 case VVERTICAL_HVERTICAL_26:
2791 threadSpace->Wavefront26ZISeqVVHV26();
2792 break;
2793 case VVERTICAL_HHORIZONTAL_26:
2794 threadSpace->Wavefront26ZISeqVVHH26();
2795 break;
2796 case VVERTICAL26_HHORIZONTAL26:
2797 threadSpace->Wavefront26ZISeqVV26HH26();
2798 break;
2799 case VVERTICAL1X26_HHORIZONTAL1X26:
2800 threadSpace->Wavefront26ZISeqVV1x26HH1x26();
2801 break;
2802 default:
2803 threadSpace->Wavefront26ZISeqVVHV26();
2804 break;
2805 }
2806 break;
2807
2808 case CM_HORIZONTAL_WAVE:
2809 threadSpace->HorizentalSequence();
2810 break;
2811
2812 case CM_VERTICAL_WAVE:
2813 threadSpace->VerticalSequence();
2814 break;
2815
2816 case CM_NONE_DEPENDENCY:
2817 case CM_WAVEFRONT26X:
2818 case CM_WAVEFRONT26ZIG:
2819 break;
2820
2821 default:
2822 CM_ASSERTMESSAGE("Error: Invalid thread dependency type.");
2823 hr = CM_FAILURE;
2824 break;
2825 }
2826 }
2827
2828 finish:
2829 return hr;
2830 }
2831
2832 //*-----------------------------------------------------------------------------
2833 //| Purpose: Create temp args array with surface array broken down
2834 //| instructions will be copied into DstMem
2835 //*-----------------------------------------------------------------------------
CreateTempArgs(uint32_t numArgs,CM_ARG * & tempArgs)2836 int32_t CmKernelRT::CreateTempArgs(
2837 uint32_t numArgs,
2838 CM_ARG* &tempArgs)
2839 {
2840 int32_t hr = CM_SUCCESS;
2841 int32_t numSurfaces = 0;
2842 int32_t increasedArgs = 0;
2843
2844 if( numArgs < m_argCount || tempArgs != nullptr )
2845 {
2846 CM_ASSERTMESSAGE("Error: Invalid arg number or arg value.");
2847 hr = CM_FAILURE;
2848 goto finish;
2849 }
2850
2851 tempArgs = MOS_NewArray(CM_ARG, numArgs);
2852 CM_CHK_NULL_GOTOFINISH(tempArgs, CM_OUT_OF_HOST_MEMORY);
2853 CmSafeMemSet(tempArgs, 0, numArgs* sizeof(CM_ARG) );
2854
2855 for( uint32_t j = 0; j < m_argCount; j++ )
2856 {
2857 if ( CHECK_SURFACE_TYPE( m_args[ j ].unitKind, // first time
2858 ARG_KIND_SURFACE,
2859 ARG_KIND_SURFACE_1D,
2860 ARG_KIND_SURFACE_2D,
2861 ARG_KIND_SURFACE_2D_UP,
2862 ARG_KIND_SURFACE_SAMPLER,
2863 ARG_KIND_SURFACE2DUP_SAMPLER,
2864 ARG_KIND_SURFACE_3D,
2865 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
2866 ARG_KIND_SURFACE_SAMPLER8X8_VA,
2867 ARG_KIND_SURFACE_2D_SCOREBOARD,
2868 ARG_KIND_STATE_BUFFER ) )
2869 {
2870 numSurfaces = m_args[j].unitSize/sizeof(int);
2871
2872 if (numSurfaces > 1)
2873 {
2874 if (m_args[j].unitCount == 1)
2875 { //Kernel arg
2876 for (int32_t k = 0; k < numSurfaces; k++)
2877 {
2878 tempArgs[j + increasedArgs + k] = m_args[j];
2879 tempArgs[j + increasedArgs + k].unitSize = sizeof(int32_t);
2880 tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int32_t);
2881 tempArgs[j + increasedArgs + k].value = (uint8_t *)((uint32_t *)m_args[j].value + k);
2882 tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2883 tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2884 //For each surface kind and custom value in surface array
2885 if (!m_args[j].surfIndex[k])
2886 {
2887 //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
2888 //This is for special usage if there is empty element in surface array.
2889 tempArgs[j + increasedArgs + k].unitKind = CM_ARGUMENT_SURFACE2D;
2890 continue;
2891 }
2892 tempArgs[j + increasedArgs + k].unitKind = m_args[j].surfArrayArg[k].argKindForArray;
2893 tempArgs[j + increasedArgs + k].nCustomValue = m_args[j].surfArrayArg[k].addressModeForArray;
2894 }
2895 }
2896 else
2897 {
2898 uint32_t *surfaces = (uint32_t *)MOS_NewArray(uint8_t, ((sizeof(int32_t) * m_args[j].unitCount)));
2899 CM_CHK_NULL_GOTOFINISH(surfaces, CM_OUT_OF_HOST_MEMORY);
2900 for (int32_t k = 0; k < numSurfaces; k++)
2901 {
2902 tempArgs[j + increasedArgs + k] = m_args[j];
2903 tempArgs[j + increasedArgs + k].unitSize = sizeof(int32_t);
2904 tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int32_t);
2905 tempArgs[j + increasedArgs + k].value = MOS_NewArray(uint8_t, ((sizeof(int32_t) * m_args[j].unitCount)));
2906 if(tempArgs[j + increasedArgs + k].value == nullptr)
2907 {
2908 CM_ASSERTMESSAGE("Error: Out of system memory.");
2909 hr = CM_OUT_OF_HOST_MEMORY;
2910 MosSafeDeleteArray(surfaces);
2911 goto finish;
2912 }
2913 for (uint32_t s = 0; s < m_args[j].unitCount; s++)
2914 {
2915 surfaces[s] = *(uint32_t *)((uint32_t *)m_args[j].value + k + numSurfaces * s);
2916 }
2917 CmSafeMemCopy(tempArgs[j + increasedArgs + k].value, surfaces, sizeof(int32_t) * m_args[j].unitCount);
2918 tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2919 tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = (uint16_t)-1;
2920 }
2921 MosSafeDeleteArray(surfaces);
2922 }
2923 increasedArgs += numSurfaces - 1;
2924 }
2925 else
2926 {
2927 tempArgs[j + increasedArgs] = m_args[j];
2928 }
2929 }
2930 else if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
2931 {
2932 numSurfaces = m_args[ j ].unitVmeArraySize;
2933 if(numSurfaces == 1)
2934 { // single vme surface
2935 tempArgs[j + increasedArgs] = m_args[j];
2936 }
2937 else
2938 { // multiple vme surfaces in surface array
2939 if (m_args[j].unitCount == 1) { //Kernel arg
2940 uint32_t vmeSurfOffset = 0;
2941
2942 for (int32_t k = 0; k < numSurfaces; k++)
2943 {
2944 uint16_t vmeSize = (uint16_t)getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[j].value + vmeSurfOffset));
2945
2946 tempArgs[j + increasedArgs + k] = m_args[j];
2947 tempArgs[j + increasedArgs + k].unitSize = vmeSize;
2948 tempArgs[j + increasedArgs + k].unitSizeOrig = vmeSize;
2949 tempArgs[j + increasedArgs + k].value = (uint8_t *)(m_args[j].value + vmeSurfOffset);
2950 tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + k*4;
2951 tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2952
2953 vmeSurfOffset += vmeSize;
2954 }
2955 }
2956 }
2957 increasedArgs += numSurfaces - 1;
2958 }
2959 else if (m_args[j].unitKind == ARG_KIND_SAMPLER)
2960 {
2961 unsigned int numSamplers = m_args[j].unitSize / sizeof(int);
2962
2963 if (numSamplers > 1)
2964 {
2965 if (m_args[j].unitCount == 1)
2966 {
2967 //Kernel arg
2968 for (unsigned int k = 0; k < numSamplers; k++)
2969 {
2970 tempArgs[j + increasedArgs + k] = m_args[j];
2971 tempArgs[j + increasedArgs + k].unitSize = sizeof(int);
2972 tempArgs[j + increasedArgs + k].unitSizeOrig = sizeof(int);
2973 tempArgs[j + increasedArgs + k].value = (unsigned char *)((unsigned int *)m_args[j].value + k);
2974 tempArgs[j + increasedArgs + k].unitOffsetInPayload = m_args[j].unitOffsetInPayload + 4 * k;
2975 tempArgs[j + increasedArgs + k].unitOffsetInPayloadOrig = tempArgs[j + increasedArgs + k].unitOffsetInPayload;
2976 tempArgs[j + increasedArgs + k].unitKind = CM_ARGUMENT_SAMPLER;
2977 }
2978 }
2979 else
2980 {
2981 // Use sampler index array as thread arg.
2982 // Not implemented yet.
2983 return CM_NOT_IMPLEMENTED;
2984 }
2985 increasedArgs += numSamplers - 1;
2986 }
2987 else
2988 {
2989 tempArgs[j + increasedArgs] = m_args[j];
2990 }
2991 }
2992 else
2993 {
2994 tempArgs[j + increasedArgs] = m_args[j];
2995 }
2996 }
2997
2998 finish:
2999 if(hr == CM_OUT_OF_HOST_MEMORY)
3000 {
3001 if(tempArgs)
3002 {
3003 for (uint32_t j = 0; j < numArgs; j++)
3004 {
3005 MosSafeDeleteArray(tempArgs[j].value);
3006 }
3007 }
3008 MosSafeDeleteArray( tempArgs );
3009 }
3010 return hr;
3011 }
3012
//*-----------------------------------------------------------------------------
//| Purpose:    Get the number of args, including the surfaces contained in surface arrays
//*-----------------------------------------------------------------------------
GetArgCountPlusSurfArray(uint32_t & argSize,uint32_t & argCountPlus)3016 int32_t CmKernelRT::GetArgCountPlusSurfArray(uint32_t &argSize, uint32_t & argCountPlus)
3017 {
3018 argCountPlus = m_argCount;
3019 argSize = 0;
3020
3021 if(m_usKernelPayloadDataSize)
3022 { // if payload data exists, the number of args is zero
3023 argCountPlus = 0;
3024 argSize = 0;
3025 return CM_SUCCESS;
3026 }
3027
3028 if( m_argCount != 0 ) //Need pass the arg either by arguments area, or by indirect payload area
3029 {
3030 //Sanity check for argument setting
3031 if((m_perThreadArgExists == false) && (m_perKernelArgExists == false) && (m_usKernelPayloadDataSize == 0))
3032 {
3033 if ( m_stateBufferBounded == CM_STATE_BUFFER_NONE )
3034 {
3035 CM_ASSERTMESSAGE( "Error: Kernel arguments are not set." );
3036 return CM_NOT_SET_KERNEL_ARGUMENT;
3037 }
3038 }
3039
3040 if(m_perThreadArgExists || m_perKernelArgExists)
3041 {
3042 unsigned int extraArgs = 0;
3043
3044 for( uint32_t j = 0; j < m_argCount; j ++ )
3045 {
3046 //Sanity checking for every argument setting
3047 if ( !m_args[j].isSet )
3048 {
3049 CM_ASSERTMESSAGE("Error: One Kernel argument is not set.");
3050 return CM_KERNEL_ARG_SETTING_FAILED;
3051 }
3052
3053 argSize += m_args[j].unitSize * m_args[j].unitCount;
3054
3055 if ( CHECK_SURFACE_TYPE( m_args[ j ].unitKind,
3056 ARG_KIND_SURFACE,
3057 ARG_KIND_SURFACE_1D,
3058 ARG_KIND_SURFACE_2D,
3059 ARG_KIND_SURFACE_2D_UP,
3060 ARG_KIND_SURFACE_SAMPLER,
3061 ARG_KIND_SURFACE2DUP_SAMPLER,
3062 ARG_KIND_SURFACE_3D,
3063 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
3064 ARG_KIND_SURFACE_SAMPLER8X8_VA,
3065 ARG_KIND_SURFACE_2D_SCOREBOARD,
3066 ARG_KIND_STATE_BUFFER ) )
3067 {
3068 int numSurfaces = m_args[j].unitSize/sizeof(int);
3069 if (numSurfaces > 1) {
3070 extraArgs += numSurfaces - 1;
3071 }
3072 }
3073 else if (CHECK_SURFACE_TYPE(m_args[j].unitKind, ARG_KIND_SURFACE_VME))
3074 {
3075 int numSurfaces = m_args[j].unitVmeArraySize;
3076 if (numSurfaces > 1) {
3077 extraArgs += numSurfaces - 1;
3078 }
3079 }
3080 else if (m_args[j].unitKind == ARG_KIND_SAMPLER)
3081 {
3082 int numSamplers = m_args[j].unitSize / sizeof(int);
3083 if (numSamplers > 1)
3084 {
3085 extraArgs += (numSamplers - 1);
3086 }
3087 }
3088 }
3089
3090 argCountPlus = m_argCount + extraArgs;
3091 }
3092 }
3093 return CM_SUCCESS;
3094 }
3095
3096 //*-----------------------------------------------------------------------------
3097 //| Purpose: Create Thread Space Param
3098 //*-----------------------------------------------------------------------------
CreateThreadSpaceParam(PCM_HAL_KERNEL_THREADSPACE_PARAM kernelThreadSpaceParam,CmThreadSpaceRT * threadSpace)3099 int32_t CmKernelRT::CreateThreadSpaceParam(
3100 PCM_HAL_KERNEL_THREADSPACE_PARAM kernelThreadSpaceParam,
3101 CmThreadSpaceRT* threadSpace )
3102 {
3103 int32_t hr = CM_SUCCESS;
3104 CM_HAL_DEPENDENCY* dependency = nullptr;
3105 uint32_t threadSpaceWidth = 0;
3106 uint32_t threadSpaceHeight =0;
3107 CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
3108 CM_THREAD_SPACE_DIRTY_STATUS dirtyStatus = CM_THREAD_SPACE_CLEAN;
3109
3110 if (kernelThreadSpaceParam == nullptr || threadSpace == nullptr)
3111 {
3112 CM_ASSERTMESSAGE("Error: Pointer to CmKernelThreadSpaceParam or thread space is null.");
3113 hr = CM_NULL_POINTER;
3114 goto finish;
3115 }
3116
3117 threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
3118 kernelThreadSpaceParam->threadSpaceWidth = (uint16_t)threadSpaceWidth;
3119 kernelThreadSpaceParam->threadSpaceHeight = (uint16_t)threadSpaceHeight;
3120
3121 threadSpace->GetDependencyPatternType(kernelThreadSpaceParam->patternType);
3122 threadSpace->GetWalkingPattern(kernelThreadSpaceParam->walkingPattern);
3123 threadSpace->GetDependency( dependency);
3124
3125 if(dependency != nullptr)
3126 {
3127 CmSafeMemCopy(&kernelThreadSpaceParam->dependencyInfo, dependency, sizeof(CM_HAL_DEPENDENCY));
3128 }
3129
3130 if( threadSpace->CheckWalkingParametersSet( ) )
3131 {
3132 kernelThreadSpaceParam->walkingParamsValid = 1;
3133 CM_CHK_CMSTATUS_GOTOFINISH(threadSpace->GetWalkingParameters(kernelThreadSpaceParam->walkingParams));
3134 }
3135 else
3136 {
3137 kernelThreadSpaceParam->walkingParamsValid = 0;
3138 }
3139
3140 if( threadSpace->CheckDependencyVectorsSet( ) )
3141 {
3142 kernelThreadSpaceParam->dependencyVectorsValid = 1;
3143 CM_CHK_CMSTATUS_GOTOFINISH(threadSpace->GetDependencyVectors(kernelThreadSpaceParam->dependencyVectors));
3144 }
3145 else
3146 {
3147 kernelThreadSpaceParam->dependencyVectorsValid = 0;
3148 }
3149
3150 threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
3151
3152 if(threadSpaceUnit)
3153 {
3154 kernelThreadSpaceParam->threadCoordinates = MOS_NewArray(CM_HAL_SCOREBOARD, (threadSpaceWidth * threadSpaceHeight));
3155 CM_CHK_NULL_GOTOFINISH(kernelThreadSpaceParam->threadCoordinates , CM_OUT_OF_HOST_MEMORY);
3156 CmSafeMemSet(kernelThreadSpaceParam->threadCoordinates, 0, threadSpaceHeight * threadSpaceWidth * sizeof(CM_HAL_SCOREBOARD));
3157
3158 uint32_t *boardOrder = nullptr;
3159 threadSpace->GetBoardOrder(boardOrder);
3160 CM_CHK_NULL_GOTOFINISH_CMERROR(boardOrder);
3161
3162 kernelThreadSpaceParam->reuseBBUpdateMask = 0;
3163 for(uint32_t i=0; i< threadSpaceWidth * threadSpaceHeight ; i++)
3164 {
3165 kernelThreadSpaceParam->threadCoordinates[i].x = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.x;
3166 kernelThreadSpaceParam->threadCoordinates[i].y = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.y;
3167 kernelThreadSpaceParam->threadCoordinates[i].mask = threadSpaceUnit[boardOrder[i]].dependencyMask;
3168 kernelThreadSpaceParam->threadCoordinates[i].resetMask= threadSpaceUnit[boardOrder[i]].reset;
3169 kernelThreadSpaceParam->threadCoordinates[i].color = threadSpaceUnit[boardOrder[i]].scoreboardColor;
3170 kernelThreadSpaceParam->threadCoordinates[i].sliceSelect = threadSpaceUnit[boardOrder[i]].sliceDestinationSelect;
3171 kernelThreadSpaceParam->threadCoordinates[i].subSliceSelect = threadSpaceUnit[boardOrder[i]].subSliceDestinationSelect;
3172 kernelThreadSpaceParam->reuseBBUpdateMask |= threadSpaceUnit[boardOrder[i]].reset;
3173 }
3174
3175 if( kernelThreadSpaceParam->patternType == CM_WAVEFRONT26Z )
3176 {
3177 CM_HAL_WAVEFRONT26Z_DISPATCH_INFO dispatchInfo;
3178 threadSpace->GetWavefront26ZDispatchInfo(dispatchInfo);
3179
3180 kernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
3181 kernelThreadSpaceParam->dispatchInfo.numThreadsInWave = MOS_NewArray(uint32_t, dispatchInfo.numWaves);
3182 CM_CHK_NULL_GOTOFINISH(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave, CM_OUT_OF_HOST_MEMORY);
3183 CmSafeMemCopy(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave,
3184 dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
3185
3186 }
3187 }
3188
3189 //Get group select setting information
3190 threadSpace->GetMediaWalkerGroupSelect(kernelThreadSpaceParam->groupSelect);
3191
3192 //Get color count
3193 threadSpace->GetColorCountMinusOne(kernelThreadSpaceParam->colorCountMinusOne);
3194
3195 dirtyStatus = threadSpace->GetDirtyStatus();
3196 switch (dirtyStatus)
3197 {
3198 case CM_THREAD_SPACE_CLEAN:
3199 kernelThreadSpaceParam->bbDirtyStatus = CM_HAL_BB_CLEAN;
3200 break;
3201 default:
3202 kernelThreadSpaceParam->bbDirtyStatus = CM_HAL_BB_DIRTY;
3203 break;
3204 }
3205
3206 finish:
3207 if( hr == CM_OUT_OF_HOST_MEMORY)
3208 {
3209 if( kernelThreadSpaceParam )
3210 {
3211 MosSafeDeleteArray(kernelThreadSpaceParam->dispatchInfo.numThreadsInWave);
3212 MosSafeDeleteArray(kernelThreadSpaceParam->threadCoordinates);
3213 }
3214 }
3215
3216 return hr;
3217 }
3218
3219 //*-----------------------------------------------------------------------------
3220 //| Purpose: Delete the args array
3221 //*-----------------------------------------------------------------------------
DestroyArgs(void)3222 int32_t CmKernelRT::DestroyArgs( void )
3223 {
3224 for( uint32_t i =0 ; i < m_argCount; i ++ )
3225 {
3226 CM_ARG& arg = m_args[ i ];
3227 MosSafeDeleteArray( arg.value );
3228 MosSafeDeleteArray(arg.surfIndex);
3229 MosSafeDeleteArray(arg.surfArrayArg);
3230 arg.unitCount = 0;
3231 arg.unitSize = 0;
3232 arg.unitKind = 0;
3233 arg.unitOffsetInPayload = 0;
3234 arg.isDirty = true;
3235 arg.isSet = false;
3236 }
3237
3238 MosSafeDeleteArray( m_args );
3239
3240 m_threadSpaceAssociated = false;
3241 m_threadSpace = nullptr;
3242
3243 m_perThreadArgExists = false;
3244 m_perKernelArgExists = false;
3245
3246 m_sizeInCurbe = 0;
3247 m_curbeEnabled = true;
3248
3249 m_sizeInPayload = 0;
3250 m_adjustScoreboardY = 0;
3251
3252 ResetKernelSurfaces();
3253
3254 return CM_SUCCESS;
3255 }
3256
//*-----------------------------------------------------------------------------
// Calling Reset makes it possible to change the per-kernel or per-thread
// property of the arguments, because it releases the memory held for them.
//*-----------------------------------------------------------------------------
Reset(void)3261 int32_t CmKernelRT::Reset( void )
3262 {
3263 for( uint32_t i =0 ; i < m_argCount; i ++ )
3264 {
3265 CM_ARG& arg = m_args[ i ];
3266 MosSafeDeleteArray( arg.value );
3267 MosSafeDeleteArray( arg.surfIndex);
3268 MosSafeDeleteArray(arg.surfArrayArg);
3269 arg.value = nullptr;
3270 arg.unitCount = 0;
3271
3272 arg.unitSize = arg.unitSizeOrig;
3273 arg.unitKind = arg.unitKindOrig;
3274 arg.unitOffsetInPayload = arg.unitOffsetInPayloadOrig;
3275
3276 arg.isDirty = true;
3277 arg.isSet = false;
3278 arg.unitVmeArraySize = 0;
3279
3280 arg.isStatelessBuffer = false;
3281 arg.index = 0;
3282 }
3283
3284 m_threadCount = 0;
3285
3286 m_indexInTask = 0;
3287
3288 m_perThreadArgExists = false;
3289 m_perKernelArgExists = false;
3290
3291 m_sizeInCurbe = 0;
3292 m_curbeEnabled = true;
3293
3294 m_sizeInPayload = 0;
3295
3296 m_threadSpaceAssociated = false;
3297 m_threadSpace = nullptr;
3298 m_adjustScoreboardY = 0;
3299
3300 m_threadGroupSpace = nullptr;
3301
3302 MosSafeDeleteArray(m_kernelPayloadData);
3303 m_usKernelPayloadDataSize = 0;
3304
3305 if (m_usKernelPayloadSurfaceCount)
3306 {
3307 CmSafeMemSet(m_pKernelPayloadSurfaceArray, 0, m_usKernelPayloadSurfaceCount * sizeof(SurfaceIndex *));
3308 CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
3309 m_usKernelPayloadSurfaceCount = 0;
3310 }
3311
3312 ResetKernelSurfaces();
3313
3314 return CM_SUCCESS;
3315 }
3316
3317 //*-----------------------------------------------------------------------------
3318 //| Purpose: Get the pointer to arguments array
3319 //*-----------------------------------------------------------------------------
GetArgs(CM_ARG * & arg)3320 int32_t CmKernelRT::GetArgs( CM_ARG* & arg )
3321 {
3322 arg = m_args;
3323 return CM_SUCCESS;
3324 }
3325
3326 //*-----------------------------------------------------------------------------
3327 //| Purpose: Get the arguments' count
3328 //*-----------------------------------------------------------------------------
GetArgCount(uint32_t & argCount)3329 int32_t CmKernelRT::GetArgCount( uint32_t & argCount )
3330 {
3331 argCount = m_argCount;
3332 return CM_SUCCESS;
3333 }
3334
3335 //*-----------------------------------------------------------------------------
3336 //| Purpose: Get the value of member CurbeEnable
3337 //*-----------------------------------------------------------------------------
GetCurbeEnable(bool & b)3338 int32_t CmKernelRT::GetCurbeEnable( bool& b )
3339 {
3340 b = m_curbeEnabled;
3341 return CM_SUCCESS;
3342 }
3343
3344 //*-----------------------------------------------------------------------------
3345 //| Purpose: Set the CurbeEnable member
3346 //*-----------------------------------------------------------------------------
SetCurbeEnable(bool b)3347 int32_t CmKernelRT::SetCurbeEnable( bool b )
3348 {
3349 m_curbeEnabled = b;
3350 return CM_SUCCESS;
3351 }
3352
3353 //*-----------------------------------------------------------------------------
3354 //| Purpose: Get the kernel's size in Curbe
3355 //*-----------------------------------------------------------------------------
GetSizeInCurbe(uint32_t & size)3356 int32_t CmKernelRT::GetSizeInCurbe( uint32_t& size )
3357 {
3358 size = m_sizeInCurbe;
3359 return CM_SUCCESS;
3360 }
3361
//*-----------------------------------------------------------------------------
//| Purpose:    Get the total size in payload of media object or media walker
//*-----------------------------------------------------------------------------
GetSizeInPayload(uint32_t & size)3365 int32_t CmKernelRT::GetSizeInPayload( uint32_t& size )
3366 {
3367 size = m_sizeInPayload;
3368 return CM_SUCCESS;
3369 }
3370
3371 //*-----------------------------------------------------------------------------
3372 //| Purpose: Get the pointer to CM device
3373 //*-----------------------------------------------------------------------------
GetCmDevice(CmDeviceRT * & device)3374 int32_t CmKernelRT::GetCmDevice(CmDeviceRT* &device)
3375 {
3376 device = m_device;
3377 return CM_SUCCESS;
3378 }
3379
GetCmProgram(CmProgramRT * & program)3380 int32_t CmKernelRT::GetCmProgram( CmProgramRT* & program )
3381 {
3382 program = m_program;
3383 return CM_SUCCESS;
3384 }
3385
CollectKernelSurface()3386 int32_t CmKernelRT::CollectKernelSurface()
3387 {
3388 m_vmeSurfaceCount = 0;
3389 m_maxSurfaceIndexAllocated = 0;
3390
3391 for( uint32_t j = 0; j < m_argCount; j ++ )
3392 {
3393 if ((m_args[ j ].unitKind == ARG_KIND_SURFACE ) || // first time
3394 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_1D ) ||
3395 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D ) ||
3396 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D_UP ) ||
3397 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER ) ||
3398 ( m_args[ j ].unitKind == ARG_KIND_SURFACE2DUP_SAMPLER ) ||
3399 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_3D ) ||
3400 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_AVS) ||
3401 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_SAMPLER8X8_VA) ||
3402 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_VME ) ||
3403 ( m_args[ j ].unitKind == ARG_KIND_SURFACE_2D_SCOREBOARD) ||
3404 ( m_args[ j ].unitKind == ARG_KIND_STATE_BUFFER ) )
3405 {
3406 int numSurfaces;
3407 int numValidSurfaces = 0;
3408
3409 if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
3410 {
3411 numSurfaces = getSurfNumFromArgArraySize(m_args[j].unitSize, m_args[j].unitVmeArraySize);
3412 }
3413 else
3414 {
3415 numSurfaces = m_args[j].unitSize/sizeof(int);
3416 }
3417
3418 for (uint32_t k = 0; k < numSurfaces * m_args[j].unitCount; k ++)
3419 {
3420 uint16_t surfIndex = 0;
3421 if (m_args[j].surfIndex)
3422 {
3423 surfIndex = m_args[j].surfIndex[k];
3424 }
3425 if (surfIndex != 0 && surfIndex != CM_NULL_SURFACE)
3426 {
3427 m_surfaceArray[surfIndex] = true;
3428 numValidSurfaces ++;
3429 m_maxSurfaceIndexAllocated = Max(m_maxSurfaceIndexAllocated, surfIndex);
3430 }
3431 }
3432 if (m_args[ j ].unitKind == ARG_KIND_SURFACE_VME)
3433 {
3434 m_vmeSurfaceCount += numValidSurfaces;
3435 }
3436 }
3437
3438 if (m_args[ j ].isStatelessBuffer)
3439 {
3440 uint32_t surfIndex = m_args[j].index;
3441 m_surfaceArray[surfIndex] = true;
3442 }
3443 }
3444
3445 for( int32_t i=0; i < CM_GLOBAL_SURFACE_NUMBER; ++i )
3446 {
3447 if( m_globalSurfaces[i] != nullptr )
3448 {
3449 uint32_t surfIndex = m_globalCmIndex[i];
3450 m_surfaceArray[surfIndex] = true;
3451 }
3452 }
3453
3454 for (int32_t i = 0; i < m_usKernelPayloadSurfaceCount; i++)
3455 {
3456 if (m_pKernelPayloadSurfaceArray[i] != nullptr)
3457 {
3458 uint32_t surfIndex = m_pKernelPayloadSurfaceArray[i]->get_data();
3459 m_surfaceArray[surfIndex] = true;
3460 }
3461 }
3462
3463 return CM_SUCCESS;
3464 }
3465
IsKernelDataReusable(CmThreadSpaceRT * threadSpace)3466 int32_t CmKernelRT::IsKernelDataReusable( CmThreadSpaceRT* threadSpace)
3467 {
3468 if(threadSpace)
3469 {
3470 if(threadSpace->IsThreadAssociated() && (threadSpace->GetDirtyStatus()!= CM_THREAD_SPACE_CLEAN))
3471 {
3472 return false;
3473 }
3474 }
3475
3476 if(m_threadSpace)
3477 {
3478 if(m_threadSpace->GetDirtyStatus()!= CM_THREAD_SPACE_CLEAN)
3479 {
3480 return false;
3481 }
3482 }
3483
3484 if(m_dirty != CM_KERNEL_DATA_CLEAN)
3485 {
3486 return false;
3487 }
3488
3489 return true;
3490 }
3491
3492 //*-----------------------------------------------------------------------------
3493 //| Purpose: Prepare Kernel Data including thread args, kernel args
3494 //| Returns: Result of the operation.
3495 //*-----------------------------------------------------------------------------
CreateKernelData(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadSpaceRT * threadSpace)3496 int32_t CmKernelRT::CreateKernelData(
3497 CmKernelData* & kernelData, // out
3498 uint32_t& kernelDataSize, // out
3499 const CmThreadSpaceRT* threadSpace ) // in
3500 {
3501 int32_t hr = CM_SUCCESS;
3502 PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
3503
3504 if( (threadSpace != nullptr) && (m_threadSpace != nullptr) )
3505 {
3506 // per-kernel threadspace and per-task threadspace cannot be set at the same time
3507 return CM_INVALID_THREAD_SPACE;
3508 }
3509
3510 if(m_lastKernelData == nullptr)
3511 {
3512 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3513 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3514 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3515 }
3516 else
3517 {
3518 if(IsKernelDataReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
3519 {
3520 // nothing changed; Reuse m_lastKernelData
3521 kernelData = m_lastKernelData;
3522 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3523 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3524 kernelDataSize = kernelData->GetKernelDataSize();
3525
3526 if (m_threadSpace)
3527 {
3528 halKernelParam = kernelData->GetHalCmKernelData();
3529 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
3530 // need to set to clean here because CmThreadSpaceParam.BBdirtyStatus is only set in CreateKernelDataInternal
3531 // flag used to re-use batch buffer, don't care if BB is busy if it is "clean"
3532 halKernelParam->kernelThreadSpaceParam.bbDirtyStatus = CM_HAL_BB_CLEAN;
3533 }
3534 }
3535 else
3536 {
3537 if(m_lastKernelData->IsInUse())
3538 { // Need to Create a new one , if the kernel data is in use
3539 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3540 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3541 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3542 }
3543 else if(threadSpace && threadSpace->IsThreadAssociated() && (threadSpace->GetDirtyStatus() != CM_THREAD_SPACE_CLEAN))
3544 { // if thread space is assocaited , don't support reuse
3545 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3546 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3547 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3548 }
3549 else if(m_dirty < CM_KERNEL_DATA_THREAD_COUNT_DIRTY || // Kernel arg or thread arg dirty
3550 (m_threadSpace && m_threadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DEPENDENCY_MASK_DIRTY))
3551 {
3552 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelData(m_lastKernelData,threadSpace));
3553 kernelData = m_lastKernelData;
3554 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3555 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3556 kernelDataSize = kernelData->GetKernelDataSize();
3557
3558 }
3559 else
3560 {
3561 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, threadSpace));
3562 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3563 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3564 }
3565 }
3566 }
3567
3568 CleanArgDirtyFlag();
3569 if(threadSpace)
3570 {
3571 threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3572 }
3573 if (m_threadSpace)
3574 {
3575 m_threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3576 }
3577
3578 finish:
3579 return hr;
3580 }
3581
GetName()3582 char* CmKernelRT::GetName() { return (char*)m_kernelInfo->kernelName; }
3583
3584 //*-----------------------------------------------------------------------------
3585 //| Purpose: Create Kernel Data
3586 //| Returns: Result of the operation.
3587 //*-----------------------------------------------------------------------------
CreateKernelData(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadGroupSpace * threadGroupSpace)3588 int32_t CmKernelRT::CreateKernelData(
3589 CmKernelData* & kernelData, // out
3590 uint32_t& kernelDataSize, // out
3591 const CmThreadGroupSpace* threadGroupSpace ) // in
3592 {
3593 int32_t hr = CM_SUCCESS;
3594 CmThreadGroupSpace* usedThreadGroupSpace = nullptr;
3595
3596 //If kernel has associated TGS, we will use it, instead of per-task TGS
3597 if (m_threadGroupSpace)
3598 {
3599 usedThreadGroupSpace = m_threadGroupSpace;
3600 }
3601 else
3602 {
3603 usedThreadGroupSpace = const_cast<CmThreadGroupSpace*>(threadGroupSpace);
3604 }
3605
3606 if(m_lastKernelData == nullptr)
3607 {
3608 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, usedThreadGroupSpace));
3609 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3610 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3611 }
3612 else
3613 {
3614 if (!((m_dirty & CM_KERNEL_DATA_KERNEL_ARG_DIRTY) || (m_dirty & CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY)))
3615 {
3616 // nothing changed; Reuse m_lastKernelData
3617 kernelData = m_lastKernelData;
3618 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3619 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3620 kernelDataSize = kernelData->GetKernelDataSize();
3621 }
3622 else
3623 {
3624 if(m_lastKernelData->IsInUse())
3625 { // Need to Clone a new one
3626 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelDataInternal(kernelData, kernelDataSize, usedThreadGroupSpace));
3627 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel/program's ref count
3628 CM_CHK_CMSTATUS_GOTOFINISH(UpdateLastKernelData(kernelData));
3629 }
3630 else
3631 {
3632 // change happend -> Reuse m_lastKernelData but need to change its content accordingly
3633 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelData(m_lastKernelData, usedThreadGroupSpace));
3634 kernelData = m_lastKernelData;
3635 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelData(kernelData));
3636 CM_CHK_CMSTATUS_GOTOFINISH(AcquireKernelProgram()); // increase kernel and program's ref count
3637 kernelDataSize = kernelData->GetKernelDataSize();
3638 }
3639 }
3640 }
3641
3642 CleanArgDirtyFlag();
3643
3644 finish:
3645 return hr;
3646 }
3647
CleanArgDirtyFlag()3648 int32_t CmKernelRT::CleanArgDirtyFlag()
3649 {
3650
3651 for(uint32_t i =0 ; i< m_argCount; i++)
3652 {
3653 m_args[i].isDirty = false;
3654 }
3655
3656 if(m_threadSpace && m_threadSpace->GetDirtyStatus())
3657 {
3658 m_threadSpace->SetDirtyStatus(CM_THREAD_SPACE_CLEAN);
3659 }
3660
3661 m_dirty = CM_KERNEL_DATA_CLEAN;
3662
3663 return CM_SUCCESS;
3664 }
3665
3666 //*-----------------------------------------------------------------------------
3667 //| Purpose: Update the global surface and gtpin surface info to kernel data
3668 //| Returns: Result of the operation.
3669 //*-----------------------------------------------------------------------------
UpdateKernelDataGlobalSurfaceInfo(PCM_HAL_KERNEL_PARAM halKernelParam)3670 int32_t CmKernelRT::UpdateKernelDataGlobalSurfaceInfo( PCM_HAL_KERNEL_PARAM halKernelParam )
3671 {
3672 int32_t hr = CM_SUCCESS;
3673
3674 //global surface
3675 for ( uint32_t j = 0; j < CM_GLOBAL_SURFACE_NUMBER; j++ )
3676 {
3677 if ( m_globalSurfaces[ j ] != nullptr )
3678 {
3679 halKernelParam->globalSurface[ j ] = m_globalSurfaces[ j ]->get_data();
3680 halKernelParam->globalSurfaceUsed = true;
3681 }
3682 else
3683 {
3684 halKernelParam->globalSurface[ j ] = CM_NULL_SURFACE;
3685 }
3686 }
3687
3688 for ( uint32_t j = CM_GLOBAL_SURFACE_NUMBER; j < CM_MAX_GLOBAL_SURFACE_NUMBER; j++ )
3689 {
3690 halKernelParam->globalSurface[ j ] = CM_NULL_SURFACE;
3691 }
3692 #if USE_EXTENSION_CODE
3693 UpdateKernelDataGTPinSurfaceInfo(halKernelParam);
3694 #endif
3695
3696 return hr;
3697 }
3698
3699 //*-----------------------------------------------------------------------------
3700 //| Purpose: Prepare Kernel Data including thread args, kernel args
3701 //| Returns: Result of the operation.
3702 //*-----------------------------------------------------------------------------
CreateKernelDataInternal(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadGroupSpace * threadGroupSpace)3703 int32_t CmKernelRT::CreateKernelDataInternal(
3704 CmKernelData* & kernelData, // out
3705 uint32_t& kernelDataSize, // out
3706 const CmThreadGroupSpace* threadGroupSpace) // in
3707 {
3708 PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
3709 int32_t hr = CM_SUCCESS;
3710 uint32_t movInstNum = 0;
3711 uint32_t kernelCurbeSize = 0;
3712 uint32_t numArgs = 0;
3713 CM_ARG *tempArgs = nullptr;
3714 uint32_t argSize = 0;
3715 uint32_t surfNum = 0; //Pass needed BT entry numbers to HAL CM
3716 CmKernelRT *cmKernel = nullptr;
3717 uint32_t minKernelPlayloadOffset = 0;
3718 bool adjustLocalIdPayloadOffset = false;
3719
3720 CM_CHK_CMSTATUS_GOTOFINISH(CmKernelData::Create(this, kernelData));
3721 halKernelParam = kernelData->GetHalCmKernelData();
3722 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
3723
3724 //Get Num of args with surface array
3725 CM_CHK_CMSTATUS_GOTOFINISH(GetArgCountPlusSurfArray(argSize, numArgs));
3726
3727 //Create Temp args
3728 CM_CHK_CMSTATUS_GOTOFINISH(CreateTempArgs(numArgs, tempArgs));
3729
3730 //Create move instructions
3731 CM_CHK_CMSTATUS_GOTOFINISH(CreateMovInstructions(movInstNum, halKernelParam->movInsData, tempArgs, numArgs));
3732 CM_CHK_CMSTATUS_GOTOFINISH(CalcKernelDataSize(movInstNum, numArgs, argSize, kernelDataSize));
3733 CM_CHK_CMSTATUS_GOTOFINISH(kernelData->SetKernelDataSize(kernelDataSize));
3734
3735 halKernelParam->clonedKernelParam.isClonedKernel = m_isClonedKernel;
3736 halKernelParam->clonedKernelParam.kernelID = m_cloneKernelID;
3737 halKernelParam->clonedKernelParam.hasClones = m_hasClones;
3738
3739 halKernelParam->kernelId = m_id++;
3740 if ((m_program->m_cisaMajorVersion >= 3 && m_program->m_cisaMinorVersion >= 3))
3741 halKernelParam->numArgs = numArgs;
3742 else
3743 halKernelParam->numArgs = numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM;
3744 halKernelParam->numThreads = m_threadCount;
3745 halKernelParam->kernelBinarySize = m_binarySize + movInstNum * CM_MOVE_INSTRUCTION_SIZE;
3746 halKernelParam->kernelDataSize = kernelDataSize;
3747 halKernelParam->movInsDataSize = movInstNum * CM_MOVE_INSTRUCTION_SIZE;
3748 halKernelParam->kernelDebugEnabled = m_blhwDebugEnable;
3749
3750 halKernelParam->cmFlags = m_curbeEnabled ? CM_FLAG_CURBE_ENABLED : 0;
3751 halKernelParam->cmFlags |= m_nonstallingScoreboardEnabled ? CM_FLAG_NONSTALLING_SCOREBOARD_ENABLED : 0;
3752
3753 halKernelParam->kernelBinary = (uint8_t*)m_binary;
3754
3755 CM_CHK_CMSTATUS_GOTOFINISH(kernelData->GetCmKernel(cmKernel));
3756 if (cmKernel == nullptr)
3757 {
3758 return CM_NULL_POINTER;
3759 }
3760 MOS_SecureStrcpy(halKernelParam->kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE, cmKernel->GetName());
3761
3762 uint32_t thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth;
3763 threadGroupSpace->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);
3764
3765 for (uint32_t i = 0; i < numArgs; i++)
3766 {
3767 // get the min kernel payload offset
3768 if ((halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE) && IsKernelArg(tempArgs[i]))
3769 {
3770 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3771 {
3772 if (minKernelPlayloadOffset == 0 || minKernelPlayloadOffset > tempArgs[i].unitOffsetInPayload)
3773 {
3774 minKernelPlayloadOffset = tempArgs[i].unitOffsetInPayload;
3775 }
3776 }
3777 else
3778 {
3779 if ((minKernelPlayloadOffset == 0 || minKernelPlayloadOffset > tempArgs[i].unitOffsetInPayload) && (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID))
3780 {
3781 minKernelPlayloadOffset = tempArgs[i].unitOffsetInPayload;
3782 }
3783 }
3784 }
3785 }
3786
3787 for (uint32_t i = 0; i < numArgs; i++)
3788 {
3789 halKernelParam->argParams[i].unitCount = tempArgs[i].unitCount;
3790 halKernelParam->argParams[i].kind = (CM_HAL_KERNEL_ARG_KIND)(tempArgs[i].unitKind);
3791 halKernelParam->argParams[i].unitSize = tempArgs[i].unitSize;
3792 halKernelParam->argParams[i].payloadOffset = tempArgs[i].unitOffsetInPayload;
3793 halKernelParam->argParams[i].perThread = false;
3794 halKernelParam->argParams[i].nCustomValue = tempArgs[i].nCustomValue;
3795 halKernelParam->argParams[i].aliasIndex = tempArgs[i].aliasIndex;
3796 halKernelParam->argParams[i].aliasCreated = tempArgs[i].aliasCreated;
3797 halKernelParam->argParams[i].isNull = tempArgs[i].isNull;
3798
3799 if (tempArgs[i].unitKind == CM_ARGUMENT_IMPLICT_LOCALSIZE) {
3800 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3801 *(uint32_t *)halKernelParam->argParams[i].firstValue = thrdSpaceWidth;
3802 *(uint32_t *)(halKernelParam->argParams[i].firstValue + 4) = thrdSpaceHeight;
3803 *(uint32_t *)(halKernelParam->argParams[i].firstValue + 8) = thrdSpaceDepth;
3804 }
3805 else if (tempArgs[i].unitKind == CM_ARGUMENT_IMPLICT_GROUPSIZE) {
3806 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3807 *(uint32_t *)halKernelParam->argParams[i].firstValue = grpSpaceWidth;
3808 *(uint32_t *)(halKernelParam->argParams[i].firstValue + 4) = grpSpaceHeight;
3809 *(uint32_t *)(halKernelParam->argParams[i].firstValue + 8) = grpSpaceDepth;
3810 }
3811 else if (tempArgs[i].unitKind == ARG_KIND_IMPLICIT_LOCALID) {
3812 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelImplicitArgDataGroup(halKernelParam->argParams[i].firstValue, 3));
3813 halKernelParam->localIdIndex = i;
3814 }
3815 else
3816 CreateThreadArgData(&halKernelParam->argParams[i], i, nullptr, tempArgs);
3817
3818 if (halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE)
3819 {
3820 if (IsKernelArg(halKernelParam->argParams[i]))
3821 {
3822 // Kernel arg : calculate curbe size & adjust payloadoffset
3823 if (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID)
3824 {
3825 halKernelParam->argParams[i].payloadOffset -= minKernelPlayloadOffset;
3826 }
3827 else
3828 {
3829 // ARG_KIND_IMPLICIT_LOCALID is only for visa3.3+, need to adjust payloadOffset of local id for visa3.3+ later.
3830 adjustLocalIdPayloadOffset = true;
3831 }
3832
3833 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3834 if ((halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize > kernelCurbeSize))
3835 { // The largest one
3836 kernelCurbeSize = halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize;
3837 }
3838 }
3839 else
3840 {
3841 if ((halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize > kernelCurbeSize) && (tempArgs[i].unitKind != ARG_KIND_IMPLICIT_LOCALID))
3842 { // The largest one
3843 kernelCurbeSize = halKernelParam->argParams[i].payloadOffset + halKernelParam->argParams[i].unitSize;
3844 }
3845 }
3846 }
3847 }
3848 }
3849
3850 if ( m_stateBufferBounded != CM_STATE_BUFFER_NONE )
3851 {
3852 PCM_CONTEXT_DATA cmData = ( PCM_CONTEXT_DATA )m_device->GetAccelData();
3853 PCM_HAL_STATE state = cmData->cmHalState;
3854 kernelCurbeSize = state->pfnGetStateBufferSizeForKernel( state, this );
3855 halKernelParam->stateBufferType = state->pfnGetStateBufferTypeForKernel( state, this );
3856 }
3857
3858 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3859 {
3860 // GPGPU walker - implicit args
3861 for (uint32_t i = numArgs; i < numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM; i++)
3862 {
3863 halKernelParam->argParams[i].unitCount = 1;
3864 halKernelParam->argParams[i].kind = CM_ARGUMENT_GENERAL;
3865 halKernelParam->argParams[i].unitSize = 4;
3866 halKernelParam->argParams[i].payloadOffset = MOS_ALIGN_CEIL(kernelCurbeSize, 4) + (i - numArgs) * sizeof(uint32_t);
3867 halKernelParam->argParams[i].perThread = false;
3868 }
3869
3870 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 0].firstValue, thrdSpaceWidth));
3871 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 1].firstValue, thrdSpaceHeight));
3872 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 2].firstValue, grpSpaceWidth));
3873 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 3].firstValue, grpSpaceHeight));
3874 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 4].firstValue, thrdSpaceWidth));
3875 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup(halKernelParam->argParams[numArgs + 5].firstValue, thrdSpaceHeight));
3876 halKernelParam->localIdIndex = halKernelParam->numArgs - 2;
3877 }
3878 halKernelParam->gpgpuWalkerParams.gpgpuEnabled = true;
3879 halKernelParam->gpgpuWalkerParams.groupWidth = grpSpaceWidth;
3880 halKernelParam->gpgpuWalkerParams.groupHeight = grpSpaceHeight;
3881 halKernelParam->gpgpuWalkerParams.groupDepth = grpSpaceDepth;
3882 halKernelParam->gpgpuWalkerParams.threadHeight = thrdSpaceHeight;
3883 halKernelParam->gpgpuWalkerParams.threadWidth = thrdSpaceWidth;
3884 halKernelParam->gpgpuWalkerParams.threadDepth = thrdSpaceDepth;
3885 //Get SLM size
3886 halKernelParam->slmSize = GetSLMSize();
3887
3888 //Get spill area to adjust scratch space
3889 halKernelParam->spillSize = GetSpillMemUsed();
3890
3891 //Set Barrier mode
3892 halKernelParam->barrierMode = m_barrierMode;
3893 halKernelParam->numberThreadsInGroup = thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
3894 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3))
3895 kernelCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 4) + CM_GPUWALKER_IMPLICIT_ARG_NUM * sizeof(uint32_t);
3896 else
3897 kernelCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 4);
3898 if ((kernelCurbeSize % 32) == 4) //The per-thread data occupy 2 GRF.
3899 {
3900 halKernelParam->curbeSizePerThread = 64;
3901 }
3902 else
3903 {
3904 halKernelParam->curbeSizePerThread = 32;
3905 }
3906 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3907 halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32) - halKernelParam->curbeSizePerThread + halKernelParam->curbeSizePerThread *
3908 thrdSpaceWidth * thrdSpaceHeight;
3909 //Since the CURBE is 32 bytes alignment, for GPGPU walker without the user specified thread argument, implicit per-thread id arguments will occupy at most 32 bytes
3910 halKernelParam->crossThreadConstDataLen = MOS_ALIGN_CEIL(kernelCurbeSize, 32) - halKernelParam->curbeSizePerThread;
3911 }
3912 else {
3913 halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32) + halKernelParam->curbeSizePerThread *
3914 thrdSpaceWidth * thrdSpaceHeight * thrdSpaceDepth;
3915 //Since the CURBE is 32 bytes alignment, for GPGPU walker without the user specified thread argument, implicit per-thread id arguments will occupy at most 32 bytes
3916 halKernelParam->crossThreadConstDataLen = MOS_ALIGN_CEIL(kernelCurbeSize, 32);
3917 }
3918 halKernelParam->payloadSize = 0; // no thread arg allowed
3919
3920 // adjust payloadOffset of local id for visa3.3+
3921 if (adjustLocalIdPayloadOffset)
3922 {
3923 halKernelParam->argParams[halKernelParam->localIdIndex].payloadOffset = halKernelParam->crossThreadConstDataLen;
3924 }
3925
3926 m_sizeInCurbe = GetAlignedCurbeSize(halKernelParam->totalCurbeSize);
3927
3928 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelIndirectData(&halKernelParam->indirectDataParam));
3929
3930 if (m_samplerBtiCount != 0)
3931 {
3932 CmSafeMemCopy((void*)halKernelParam->samplerBTIParam.samplerInfo, (void*)m_samplerBtiEntry, sizeof(m_samplerBtiEntry));
3933 halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
3934
3935 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
3936 m_samplerBtiCount = 0;
3937 }
3938
3939 CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces);
3940
3941 UpdateKernelDataGlobalSurfaceInfo(halKernelParam);
3942
3943 //Destroy Temp Args
3944 for (uint32_t j = 0; j < numArgs; j++)
3945 {
3946 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
3947 {
3948 MosSafeDeleteArray(tempArgs[j].value);
3949 }
3950 }
3951 MosSafeDeleteArray(tempArgs);
3952
3953 CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
3954 finish:
3955 if (hr != CM_SUCCESS)
3956 {
3957 //Clean allocated memory : need to count the implicit args
3958 if ((m_program->m_cisaMajorVersion == 3) && (m_program->m_cisaMinorVersion < 3)) {
3959
3960 for (uint32_t i = 0; i < numArgs + CM_GPUWALKER_IMPLICIT_ARG_NUM; i++)
3961 {
3962 if (halKernelParam)
3963 {
3964 if (halKernelParam->argParams[i].firstValue)
3965 {
3966 MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
3967 }
3968 }
3969 }
3970 }
3971 else
3972 {
3973 for (uint32_t i = 0; i < numArgs; i++)
3974 {
3975 if (halKernelParam)
3976 {
3977 if (halKernelParam->argParams[i].firstValue)
3978 {
3979 MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
3980 }
3981 }
3982 }
3983 }
3984 //Destroy Temp Args in failing case
3985 if (tempArgs)
3986 {
3987 for (uint32_t j = 0; j < numArgs; j++)
3988 {
3989 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
3990 {
3991 MosSafeDeleteArray(tempArgs[j].value);
3992 }
3993 }
3994 MosSafeDeleteArray(tempArgs);
3995 }
3996 }
3997 return hr;
3998 }
3999
//*-----------------------------------------------------------------------------
//| Purpose: Check whether the batch buffer can be reused for this kernel
//| Returns: True if reusable (no relevant dirty state), false otherwise.
//*-----------------------------------------------------------------------------
IsBatchBufferReusable(CmThreadSpaceRT * taskThreadSpace)4004 bool CmKernelRT::IsBatchBufferReusable( CmThreadSpaceRT * taskThreadSpace )
4005 {
4006 bool reusable = true;
4007 //Update m_id if the batch buffer is not reusable.
4008 if (m_dirty & CM_KERNEL_DATA_THREAD_ARG_DIRTY)
4009 {
4010 reusable = false; // if thread arg dirty
4011 }
4012 else if ((m_dirty & CM_KERNEL_DATA_KERNEL_ARG_DIRTY) && (m_curbeEnabled == false))
4013 {
4014 reusable = false; // if kernel arg dirty and curbe disabled
4015 }
4016 else if (m_dirty & CM_KERNEL_DATA_THREAD_COUNT_DIRTY)
4017 {
4018 reusable = false; // if thread count dirty
4019 }
4020 else if (m_threadSpace)
4021 {
4022 if (m_threadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DATA_DIRTY)
4023 {
4024 reusable = false; // if per kernel thread space exists and it is completely dirty
4025 }
4026 }
4027 else if (taskThreadSpace)
4028 {
4029 if (taskThreadSpace->GetDirtyStatus() == CM_THREAD_SPACE_DATA_DIRTY)
4030 {
4031 reusable = false; // if per task thread space change and it is completely dirty
4032 }
4033 }
4034 return reusable;
4035
4036 }
4037
4038 //*-----------------------------------------------------------------------------
4039 //| Purpose: Checks to see if kernel prologue has changed
4040 //| Returns: Result of the operation.
4041 //*-----------------------------------------------------------------------------
IsPrologueDirty(void)4042 bool CmKernelRT::IsPrologueDirty( void )
4043 {
4044 bool prologueDirty = false;
4045
4046 if( m_threadCount != m_lastThreadCount )
4047 {
4048 if( m_lastThreadCount )
4049 {
4050 if( m_threadCount == 1 || m_lastThreadCount == 1 )
4051 {
4052 prologueDirty = true;
4053 }
4054 }
4055 m_lastThreadCount = m_threadCount;
4056 }
4057
4058 if( m_adjustScoreboardY != m_lastAdjustScoreboardY )
4059 {
4060 if( m_lastAdjustScoreboardY )
4061 {
4062 prologueDirty = true;
4063 }
4064 m_lastAdjustScoreboardY = m_adjustScoreboardY;
4065 }
4066
4067 return prologueDirty;
4068 }
4069
4070 //*-----------------------------------------------------------------------------
4071 //| Purpose: Prepare Kernel Data including thread args, kernel args
4072 //| Returns: Result of the operation.
4073 //*-----------------------------------------------------------------------------
CreateKernelDataInternal(CmKernelData * & kernelData,uint32_t & kernelDataSize,const CmThreadSpaceRT * threadSpace)4074 int32_t CmKernelRT::CreateKernelDataInternal(
4075 CmKernelData* & kernelData, // out
4076 uint32_t& kernelDataSize, // out
4077 const CmThreadSpaceRT* threadSpace ) // in
4078 {
4079 PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
4080 int32_t hr = CM_SUCCESS;
4081 uint32_t movInstNum = 0;
4082 uint32_t kernelCurbeSize = 0;
4083 uint32_t numArgs = 0;
4084 uint32_t bottomRange = 1024;
4085 uint32_t upRange = 0;
4086 uint32_t unitSize = 0;
4087 bool hasThreadArg = false;
4088 CmThreadSpaceRT *cmThreadSpace = nullptr;
4089 bool isKernelThreadSpace = false;
4090 CM_ARG *tempArgs = nullptr;
4091 uint32_t argSize = 0;
4092 uint32_t surfNum = 0; //Pass needed BT entry numbers to HAL CM
4093 CmKernelRT *cmKernel = nullptr;
4094
4095 if( threadSpace == nullptr && m_threadSpace!= nullptr)
4096 {
4097 cmThreadSpace = m_threadSpace;
4098 isKernelThreadSpace = true;
4099 }
4100 else
4101 {
4102 cmThreadSpace = const_cast<CmThreadSpaceRT*>(threadSpace);
4103 }
4104
4105 CM_CHK_CMSTATUS_GOTOFINISH(CmKernelData::Create( this, kernelData ));
4106 halKernelParam = kernelData->GetHalCmKernelData();
4107 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4108
4109 //Get Num of args with surface array
4110 CM_CHK_CMSTATUS_GOTOFINISH(GetArgCountPlusSurfArray(argSize, numArgs));
4111
4112 if( numArgs > 0)
4113 {
4114 //Create Temp args
4115 CM_CHK_CMSTATUS_GOTOFINISH(CreateTempArgs(numArgs, tempArgs));
4116 //Create move instructions
4117 CM_CHK_CMSTATUS_GOTOFINISH(CreateMovInstructions(movInstNum, halKernelParam->movInsData, tempArgs, numArgs));
4118 }
4119
4120 CM_CHK_CMSTATUS_GOTOFINISH(CalcKernelDataSize(movInstNum, numArgs, argSize, kernelDataSize));
4121 CM_CHK_CMSTATUS_GOTOFINISH(kernelData->SetKernelDataSize(kernelDataSize));
4122
4123 if(!IsBatchBufferReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
4124 {
4125 m_id ++;
4126 }
4127
4128 if( IsPrologueDirty( ) )
4129 {
4130 // can't re-use kernel binary in GSH
4131 // just update upper 16 bits
4132 uint64_t tempID = m_id;
4133 tempID >>= 48;
4134 tempID++;
4135 tempID <<= 48;
4136 // get rid of old values in upper 16 bits
4137 m_id <<= 16;
4138 m_id >>= 16;
4139 m_id |= tempID;
4140 }
4141
4142 halKernelParam->clonedKernelParam.isClonedKernel = m_isClonedKernel;
4143 halKernelParam->clonedKernelParam.kernelID = m_cloneKernelID;
4144 halKernelParam->clonedKernelParam.hasClones = m_hasClones;
4145 halKernelParam->kernelId = m_id; // kernel id , high 32-bit is kernel id, low 32-bit is kernel data id for batch buffer reuse
4146 halKernelParam->numArgs = numArgs;
4147 halKernelParam->numThreads = m_threadCount;
4148 halKernelParam->kernelBinarySize = m_binarySize + movInstNum * CM_MOVE_INSTRUCTION_SIZE;
4149 halKernelParam->kernelDataSize = kernelDataSize;
4150 halKernelParam->movInsDataSize = movInstNum * CM_MOVE_INSTRUCTION_SIZE;
4151
4152 halKernelParam->cmFlags = m_curbeEnabled ? CM_FLAG_CURBE_ENABLED : 0;
4153 halKernelParam->cmFlags |= m_nonstallingScoreboardEnabled ? CM_FLAG_NONSTALLING_SCOREBOARD_ENABLED : 0;
4154 halKernelParam->kernelDebugEnabled = m_blhwDebugEnable;
4155
4156 halKernelParam->kernelBinary = (uint8_t*)m_binary;
4157
4158 CM_CHK_CMSTATUS_GOTOFINISH( kernelData->GetCmKernel( cmKernel ) );
4159 if ( cmKernel == nullptr )
4160 {
4161 return CM_NULL_POINTER;
4162 }
4163 MOS_SecureStrcpy( halKernelParam->kernelName, CM_MAX_KERNEL_NAME_SIZE_IN_BYTE, cmKernel->GetName() );
4164
4165 if ( cmThreadSpace )
4166 {// either from per kernel thread space or per task thread space
4167 CM_CHK_CMSTATUS_GOTOFINISH(SortThreadSpace(cmThreadSpace)); // must be called before CreateThreadArgData
4168 }
4169
4170 for(uint32_t i =0 ; i< numArgs; i++)
4171 {
4172 halKernelParam->argParams[i].unitCount = tempArgs[ i ].unitCount;
4173 halKernelParam->argParams[i].kind = (CM_HAL_KERNEL_ARG_KIND)(tempArgs[ i ].unitKind);
4174 halKernelParam->argParams[i].unitSize = tempArgs[ i ].unitSize;
4175 halKernelParam->argParams[i].payloadOffset = tempArgs[ i ].unitOffsetInPayload;
4176 halKernelParam->argParams[i].perThread = (tempArgs[ i ].unitCount > 1) ? true :false;
4177 halKernelParam->argParams[i].nCustomValue = tempArgs[ i ].nCustomValue;
4178 halKernelParam->argParams[i].aliasIndex = tempArgs[ i ].aliasIndex;
4179 halKernelParam->argParams[i].aliasCreated = tempArgs[ i ].aliasCreated;
4180 halKernelParam->argParams[i].isNull = tempArgs[ i ].isNull;
4181
4182 CreateThreadArgData(&halKernelParam->argParams[i], i, cmThreadSpace, tempArgs);
4183
4184 if(CHECK_SURFACE_TYPE ( halKernelParam->argParams[i].kind,
4185 ARG_KIND_SURFACE_VME,
4186 ARG_KIND_SURFACE_SAMPLER,
4187 ARG_KIND_SURFACE2DUP_SAMPLER))
4188 {
4189 unitSize = CM_ARGUMENT_SURFACE_SIZE;
4190 }
4191 else
4192 {
4193 unitSize = halKernelParam->argParams[i].unitSize;
4194 }
4195
4196 if (halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE)
4197 {
4198 if(IsKernelArg(halKernelParam->argParams[i]))
4199 {
4200 // Kernel arg : calculate curbe size & adjust payloadoffset
4201 // Note: Here the payloadOffset may be different from original value
4202 uint32_t offset = halKernelParam->argParams[i].payloadOffset - CM_PAYLOAD_OFFSET;
4203 if (offset >= kernelCurbeSize)
4204 {
4205 kernelCurbeSize = offset + unitSize;
4206 }
4207 halKernelParam->argParams[i].payloadOffset -= CM_PAYLOAD_OFFSET;
4208 }
4209 }
4210
4211 if(!IsKernelArg(halKernelParam->argParams[i]))
4212 { //Thread arg : Calculate payload size & adjust payloadoffset
4213 hasThreadArg = true;
4214 halKernelParam->argParams[i].payloadOffset -= CM_PAYLOAD_OFFSET;
4215
4216 if(halKernelParam->argParams[i].payloadOffset < bottomRange)
4217 {
4218 bottomRange = halKernelParam->argParams[i].payloadOffset;
4219 }
4220 if(halKernelParam->argParams[i].payloadOffset >= upRange)
4221 {
4222 upRange = halKernelParam->argParams[i].payloadOffset + unitSize;
4223 }
4224 }
4225 }
4226
4227 if ( m_stateBufferBounded != CM_STATE_BUFFER_NONE )
4228 {
4229 PCM_CONTEXT_DATA cmData = ( PCM_CONTEXT_DATA )m_device->GetAccelData();
4230 PCM_HAL_STATE state = cmData->cmHalState;
4231 kernelCurbeSize = state->pfnGetStateBufferSizeForKernel( state, this );
4232 halKernelParam->stateBufferType = state->pfnGetStateBufferTypeForKernel( state, this );
4233 }
4234
4235 halKernelParam->payloadSize = hasThreadArg ? MOS_ALIGN_CEIL(upRange - bottomRange, 4): 0;
4236 halKernelParam->totalCurbeSize = MOS_ALIGN_CEIL(kernelCurbeSize, 32);
4237 halKernelParam->curbeSizePerThread = halKernelParam->totalCurbeSize;
4238
4239 halKernelParam->perThreadArgExisted = hasThreadArg;
4240
4241 m_sizeInCurbe = GetAlignedCurbeSize( kernelCurbeSize );
4242
4243 if ( halKernelParam->cmFlags & CM_KERNEL_FLAGS_CURBE )
4244 {
4245 for(uint32_t i=0; i< numArgs; i++)
4246 {
4247 if(!IsKernelArg(halKernelParam->argParams[i]))
4248 { // thread arg: need to minus curbe size
4249 halKernelParam->argParams[i].payloadOffset -= halKernelParam->curbeSizePerThread;
4250 }
4251 }
4252 }
4253
4254 //Create indirect data
4255 CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelIndirectData(&halKernelParam->indirectDataParam));
4256
4257 if ( m_samplerBtiCount != 0 )
4258 {
4259 CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4260 halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4261
4262 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4263 m_samplerBtiCount = 0;
4264 }
4265
4266 CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces);
4267
4268 //Create thread space param: only avaliable if per kernel ts exists
4269 if(m_threadSpace)
4270 {
4271 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadSpaceParam(&halKernelParam->kernelThreadSpaceParam, m_threadSpace));
4272 }
4273
4274 //Get SLM size
4275 halKernelParam->slmSize = GetSLMSize();
4276
4277 //Get Spill mem used
4278 halKernelParam->spillSize = GetSpillMemUsed();
4279
4280 //Set Barrier mode
4281 halKernelParam->barrierMode = m_barrierMode;
4282
4283 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4284
4285 //Destroy Temp Args
4286 for (uint32_t j = 0; j < numArgs; j++)
4287 {
4288 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4289 {
4290 MosSafeDeleteArray(tempArgs[j].value);
4291 }
4292 }
4293 MosSafeDeleteArray( tempArgs );
4294
4295 CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4296 finish:
4297 if(hr != CM_SUCCESS)
4298 {
4299 if(halKernelParam)
4300 {
4301 //Clean allocated memory
4302 for(uint32_t i =0 ; i< numArgs; i++)
4303 {
4304 if( halKernelParam->argParams[i].firstValue )
4305 {
4306 MosSafeDeleteArray(halKernelParam->argParams[i].firstValue);
4307 }
4308 }
4309 }
4310
4311 //Destroy Temp Args
4312 if (tempArgs)
4313 {
4314 for (uint32_t j = 0; j < numArgs; j++)
4315 {
4316 if (tempArgs[j].unitOffsetInPayloadOrig == (uint16_t)-1)
4317 {
4318 MosSafeDeleteArray(tempArgs[j].value);
4319 }
4320 }
4321 MosSafeDeleteArray(tempArgs);
4322 }
4323 }
4324 return hr;
4325 }
4326
4327 //*-----------------------------------------------------------------------------
4328 //| Purpose: Update kernel data's kernel arg, thread arg, thread count
4329 //| Returns: Result of the operation.
4330 //*-----------------------------------------------------------------------------
UpdateKernelData(CmKernelData * kernelData,const CmThreadSpaceRT * threadSpace)4331 int32_t CmKernelRT::UpdateKernelData(
4332 CmKernelData* kernelData, // in
4333 const CmThreadSpaceRT* threadSpace)
4334 {
4335 int32_t hr = CM_SUCCESS;
4336 PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
4337 bool bbResuable = true;
4338 CmThreadSpaceRT *cmThreadSpace = nullptr;
4339 bool isKernelThreadSpace = false;
4340 uint32_t argIndexStep = 0;
4341 uint32_t argIndex = 0;
4342 uint32_t surfNum = 0; //Update Number of surface used by kernel
4343
4344 if( threadSpace == nullptr && m_threadSpace!= nullptr)
4345 {
4346 cmThreadSpace = m_threadSpace;
4347 isKernelThreadSpace = true;
4348 }
4349 else
4350 {
4351 cmThreadSpace = const_cast<CmThreadSpaceRT*>(threadSpace);
4352 }
4353
4354 CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
4355 CM_ASSERT(kernelData->IsInUse() == false);
4356
4357 halKernelParam = kernelData->GetHalCmKernelData();
4358 CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);
4359
4360 if(!IsBatchBufferReusable(const_cast<CmThreadSpaceRT *>(threadSpace)))
4361 {
4362 m_id ++;
4363 halKernelParam->kernelId = m_id;
4364 }
4365
4366 //Update arguments
4367 for(uint32_t orgArgIndex =0 ; orgArgIndex< m_argCount; orgArgIndex++)
4368 {
4369 argIndexStep = 1;
4370
4371 if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4372 ARG_KIND_SURFACE,
4373 ARG_KIND_SURFACE_1D,
4374 ARG_KIND_SURFACE_2D,
4375 ARG_KIND_SURFACE_2D_UP,
4376 ARG_KIND_SURFACE_SAMPLER,
4377 ARG_KIND_SURFACE2DUP_SAMPLER,
4378 ARG_KIND_SURFACE_3D,
4379 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4380 ARG_KIND_SURFACE_SAMPLER8X8_VA,
4381 ARG_KIND_SURFACE_2D_SCOREBOARD,
4382 ARG_KIND_STATE_BUFFER ) )
4383 {
4384 argIndexStep = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array exists
4385 }
4386 else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4387 {
4388 argIndexStep = m_args[orgArgIndex].unitVmeArraySize;
4389 }
4390
4391 if(m_args[ orgArgIndex ].isDirty)
4392 {
4393 if(m_args[ orgArgIndex ].unitCount > 1)
4394 { // thread arg is dirty
4395 bbResuable = false;
4396 }
4397
4398 if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
4399 ARG_KIND_SURFACE,
4400 ARG_KIND_SURFACE_1D,
4401 ARG_KIND_SURFACE_2D,
4402 ARG_KIND_SURFACE_2D_UP,
4403 ARG_KIND_SURFACE_SAMPLER,
4404 ARG_KIND_SURFACE2DUP_SAMPLER,
4405 ARG_KIND_SURFACE_3D,
4406 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
4407 ARG_KIND_SURFACE_SAMPLER8X8_VA,
4408 ARG_KIND_SURFACE_2D_SCOREBOARD,
4409 ARG_KIND_STATE_BUFFER ) )
4410 { // for surface args
4411
4412 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4413 if(m_args[ orgArgIndex ].unitCount == 1) // kernel arg
4414 {
4415 if (numSurfaces > 1)
4416 {
4417 for (uint32_t kk = 0; kk < numSurfaces; kk++)
4418 {
4419 CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4420 CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4421 m_args[orgArgIndex].value + kk*sizeof(uint32_t), sizeof(uint32_t));
4422 halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4423 halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4424 halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4425
4426 if (!m_args[orgArgIndex].surfIndex[kk])
4427 {
4428 //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
4429 //This is for special usage if there is empty element in surface array.
4430 halKernelParam->argParams[argIndex + kk].kind = CM_ARGUMENT_SURFACE2D;
4431 continue;
4432 }
4433
4434 halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].surfArrayArg[kk].argKindForArray;
4435 halKernelParam->argParams[argIndex + kk].nCustomValue = m_args[orgArgIndex].surfArrayArg[kk].addressModeForArray;
4436 }
4437 }
4438 else
4439 {
4440 CM_ASSERT(halKernelParam->argParams[argIndex].firstValue != nullptr);
4441 CmSafeMemCopy(halKernelParam->argParams[argIndex].firstValue,
4442 m_args[ orgArgIndex ].value, sizeof(uint32_t));
4443 halKernelParam->argParams[argIndex].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[ orgArgIndex ].unitKind;
4444 halKernelParam->argParams[argIndex].aliasIndex = m_args[orgArgIndex].aliasIndex;
4445 halKernelParam->argParams[argIndex].aliasCreated = m_args[orgArgIndex].aliasCreated;
4446 halKernelParam->argParams[argIndex].isNull = m_args[orgArgIndex].isNull;
4447 }
4448
4449 }
4450 else // thread arg
4451 {
4452 uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
4453 uint32_t *surfaces = (uint32_t *)MOS_NewArray(uint8_t, (sizeof(uint32_t) * m_args[orgArgIndex].unitCount));
4454 CM_CHK_NULL_GOTOFINISH(surfaces, CM_OUT_OF_HOST_MEMORY);
4455 for (uint32_t kk=0; kk< numSurfaces ; kk++)
4456 {
4457 for (uint32_t s = 0; s < m_args[orgArgIndex].unitCount; s++)
4458 {
4459 surfaces[s] = *(uint32_t *)((uint32_t *)m_args[orgArgIndex].value + kk + numSurfaces * s);
4460 }
4461 CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4462 surfaces, sizeof(uint32_t) * m_args[orgArgIndex].unitCount);
4463
4464 halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[ orgArgIndex ].unitKind;
4465
4466 halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4467 halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4468 halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4469
4470 }
4471 MosSafeDeleteArray(surfaces);
4472 }
4473
4474 }
4475 else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
4476 {
4477 uint32_t numSurfaces = m_args[orgArgIndex].unitVmeArraySize;
4478 if (m_args[orgArgIndex].unitCount == 1) // kernel arg
4479 {
4480 uint32_t vmeSurfOffset = 0;
4481 for (uint32_t kk = 0; kk< numSurfaces; kk++)
4482 {
4483 uint16_t vmeSize = (uint16_t)getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[orgArgIndex].value + vmeSurfOffset));
4484
4485 // reallocate the firstValue for VME surface every time
4486 // since the number of surfaces may vary
4487 MosSafeDeleteArray(halKernelParam->argParams[argIndex + kk].firstValue);
4488 halKernelParam->argParams[argIndex + kk].firstValue = MOS_NewArray(uint8_t, vmeSize);
4489 CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
4490 CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
4491 m_args[orgArgIndex].value + vmeSurfOffset, vmeSize);
4492
4493 halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
4494
4495 halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
4496 halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
4497 halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
4498 halKernelParam->argParams[argIndex + kk].unitSize = vmeSize;
4499 vmeSurfOffset += vmeSize;
4500 }
4501 }
4502 }
4503 else
4504 {
4505 CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadArgData(&halKernelParam->argParams[argIndex ], orgArgIndex, cmThreadSpace, m_args));
4506 }
4507 }
4508 argIndex += argIndexStep;
4509 }
4510
4511 //Update Thread space param
4512 if(m_threadSpace && m_threadSpace->GetDirtyStatus())
4513 {
4514
4515 CM_CHK_CMSTATUS_GOTOFINISH(SortThreadSpace(m_threadSpace));
4516
4517 uint32_t threadSpaceWidth = 0, threadSpaceHeight = 0;
4518 PCM_HAL_KERNEL_THREADSPACE_PARAM cmKernelThreadSpaceParam = &halKernelParam->kernelThreadSpaceParam;
4519 m_threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
4520
4521 cmKernelThreadSpaceParam->threadSpaceWidth = (uint16_t)threadSpaceWidth;
4522 cmKernelThreadSpaceParam->threadSpaceHeight = (uint16_t)threadSpaceHeight;
4523 m_threadSpace->GetDependencyPatternType(cmKernelThreadSpaceParam->patternType);
4524 m_threadSpace->GetWalkingPattern(cmKernelThreadSpaceParam->walkingPattern);
4525 m_threadSpace->GetColorCountMinusOne(cmKernelThreadSpaceParam->colorCountMinusOne);
4526
4527 CM_HAL_DEPENDENCY* dependency = nullptr;
4528 m_threadSpace->GetDependency( dependency);
4529
4530 if(dependency != nullptr)
4531 {
4532 CmSafeMemCopy(&cmKernelThreadSpaceParam->dependencyInfo, dependency, sizeof(CM_HAL_DEPENDENCY));
4533 }
4534
4535 if( m_threadSpace->CheckWalkingParametersSet() )
4536 {
4537 CM_CHK_CMSTATUS_GOTOFINISH(m_threadSpace->GetWalkingParameters(cmKernelThreadSpaceParam->walkingParams));
4538 }
4539
4540 if( m_threadSpace->CheckDependencyVectorsSet() )
4541 {
4542 CM_CHK_CMSTATUS_GOTOFINISH(m_threadSpace->GetDependencyVectors(cmKernelThreadSpaceParam->dependencyVectors));
4543 }
4544
4545 if(m_threadSpace->IsThreadAssociated())
4546 {// media object only
4547 uint32_t *boardOrder = nullptr;
4548 m_threadSpace->GetBoardOrder(boardOrder);
4549 CM_CHK_NULL_GOTOFINISH_CMERROR(boardOrder);
4550
4551 CM_THREAD_SPACE_UNIT *threadSpaceUnit = nullptr;
4552 m_threadSpace->GetThreadSpaceUnit(threadSpaceUnit);
4553 CM_CHK_NULL_GOTOFINISH_CMERROR(threadSpaceUnit);
4554
4555 cmKernelThreadSpaceParam->reuseBBUpdateMask = 0;
4556 for(uint32_t i=0; i< threadSpaceWidth * threadSpaceHeight ; i++)
4557 {
4558 cmKernelThreadSpaceParam->threadCoordinates[i].x = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.x;
4559 cmKernelThreadSpaceParam->threadCoordinates[i].y = threadSpaceUnit[boardOrder[i]].scoreboardCoordinates.y;
4560 cmKernelThreadSpaceParam->threadCoordinates[i].mask = threadSpaceUnit[boardOrder[i]].dependencyMask;
4561 cmKernelThreadSpaceParam->threadCoordinates[i].resetMask = threadSpaceUnit[boardOrder[i]].reset;
4562 cmKernelThreadSpaceParam->threadCoordinates[i].color = threadSpaceUnit[boardOrder[i]].scoreboardColor;
4563 cmKernelThreadSpaceParam->threadCoordinates[i].sliceSelect = threadSpaceUnit[boardOrder[i]].sliceDestinationSelect;
4564 cmKernelThreadSpaceParam->threadCoordinates[i].subSliceSelect = threadSpaceUnit[boardOrder[i]].subSliceDestinationSelect;
4565 cmKernelThreadSpaceParam->reuseBBUpdateMask |= threadSpaceUnit[boardOrder[i]].reset;
4566 }
4567
4568 if( cmKernelThreadSpaceParam->patternType == CM_WAVEFRONT26Z )
4569 {
4570 CM_HAL_WAVEFRONT26Z_DISPATCH_INFO dispatchInfo;
4571 m_threadSpace->GetWavefront26ZDispatchInfo(dispatchInfo);
4572
4573 if (cmKernelThreadSpaceParam->dispatchInfo.numWaves >= dispatchInfo.numWaves)
4574 {
4575 cmKernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
4576 CmSafeMemCopy(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
4577 }
4578 else
4579 {
4580 cmKernelThreadSpaceParam->dispatchInfo.numWaves = dispatchInfo.numWaves;
4581 MosSafeDeleteArray(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave);
4582 cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave = MOS_NewArray(uint32_t, dispatchInfo.numWaves);
4583 CM_CHK_NULL_GOTOFINISH(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, CM_OUT_OF_HOST_MEMORY);
4584 CmSafeMemCopy(cmKernelThreadSpaceParam->dispatchInfo.numThreadsInWave, dispatchInfo.numThreadsInWave, dispatchInfo.numWaves*sizeof(uint32_t));
4585 }
4586 }
4587 }
4588 }
4589
4590 // Update indirect data
4591 if( m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY)
4592 {
4593 halKernelParam->indirectDataParam.indirectDataSize = m_usKernelPayloadDataSize;
4594 halKernelParam->indirectDataParam.surfaceCount = m_usKernelPayloadSurfaceCount;
4595
4596 if(m_usKernelPayloadDataSize != 0)
4597 {
4598 if(m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY)
4599 { // size change, need to reallocate
4600 MosSafeDeleteArray(halKernelParam->indirectDataParam.indirectData);
4601 halKernelParam->indirectDataParam.indirectData = MOS_NewArray(uint8_t, m_usKernelPayloadDataSize);
4602 CM_CHK_NULL_GOTOFINISH(halKernelParam->indirectDataParam.indirectData, CM_OUT_OF_HOST_MEMORY);
4603 }
4604 CmSafeMemCopy(halKernelParam->indirectDataParam.indirectData, (void *)m_kernelPayloadData, m_usKernelPayloadDataSize);
4605 }
4606
4607 if(m_usKernelPayloadSurfaceCount != 0)
4608 {
4609 if(m_dirty & CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY)
4610 { // size change, need to reallocate
4611 MosSafeDeleteArray(halKernelParam->indirectDataParam.surfaceInfo);
4612 halKernelParam->indirectDataParam.surfaceInfo = MOS_NewArray(CM_INDIRECT_SURFACE_INFO, m_usKernelPayloadSurfaceCount);
4613 CM_CHK_NULL_GOTOFINISH(halKernelParam->indirectDataParam.surfaceInfo, CM_OUT_OF_HOST_MEMORY);
4614
4615 }
4616 CmSafeMemCopy((void*)halKernelParam->indirectDataParam.surfaceInfo, (void*)m_IndirectSurfaceInfoArray,
4617 m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4618 //clear m_IndirectSurfaceInfoArray every enqueue
4619 CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4620 m_usKernelPayloadSurfaceCount = 0;
4621 }
4622 }
4623
4624 if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
4625 {
4626 if ( m_samplerBtiCount != 0 )
4627 {
4628 CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
4629 halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;
4630
4631 CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
4632 m_samplerBtiCount = 0;
4633 }
4634 }
4635 CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));
4636
4637 CM_CHK_CMSTATUS_GOTOFINISH(CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces));
4638
4639 CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
4640
4641 finish:
4642 if( hr != CM_SUCCESS)
4643 {
4644 if( halKernelParam )
4645 {
4646 MosSafeDeleteArray(halKernelParam->indirectDataParam.indirectData);
4647 MosSafeDeleteArray(halKernelParam->indirectDataParam.surfaceInfo);
4648 }
4649 }
4650 return hr;
4651 }
4652
4653 //*-----------------------------------------------------------------------------
4654 //| Purpose: Update kernel data's kernel arg, thread arg, thread count
4655 //| Returns: Result of the operation.
4656 //*-----------------------------------------------------------------------------
//! \brief    Refreshes an existing CmKernelData for a GPGPU-walker (thread
//!           group space) launch: re-copies dirty kernel args into the HAL
//!           arg table, refreshes sampler BTI / global-surface info, and
//!           programs the walker dimensions.
//! \param    kernelData        [in] kernel data to update; must not be in use by HW.
//! \param    threadGroupSpace  [in] group/thread dimensions for the walker.
//! \return   CM_SUCCESS, or CM_FAILURE / error code on invalid input.
int32_t CmKernelRT::UpdateKernelData(
    CmKernelData* kernelData,                     // in
    const CmThreadGroupSpace* threadGroupSpace )  // in
{
    int32_t hr = CM_SUCCESS;
    PCM_HAL_KERNEL_PARAM halKernelParam = nullptr;
    uint32_t argIndexStep = 0;
    uint32_t argIndex = 0;
    uint32_t surfNum = 0;
    // Encode a CISA (major, minor) version pair as one comparable integer.
    auto getVersionAsInt = [](int major, int minor) { return major * 100 + minor; };

    CM_CHK_NULL_GOTOFINISH_CMERROR(kernelData);
    CM_ASSERT(kernelData->IsInUse() == false);

    halKernelParam = kernelData->GetHalCmKernelData();
    CM_CHK_NULL_GOTOFINISH_CMERROR(halKernelParam);

    CM_CHK_NULL_GOTOFINISH_CMERROR(threadGroupSpace);

    //Update arguments
    for(uint32_t orgArgIndex =0 ; orgArgIndex< m_argCount; orgArgIndex++)
    {
        // Most args occupy a single slot in the flattened HAL arg table;
        // surface arrays and VME surfaces expand to one slot per element.
        argIndexStep = 1;

        if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
                        ARG_KIND_SURFACE,
                        ARG_KIND_SURFACE_1D,
                        ARG_KIND_SURFACE_2D,
                        ARG_KIND_SURFACE_2D_UP,
                        ARG_KIND_SURFACE_SAMPLER,
                        ARG_KIND_SURFACE2DUP_SAMPLER,
                        ARG_KIND_SURFACE_3D,
                        ARG_KIND_SURFACE_SAMPLER8X8_AVS,
                        ARG_KIND_SURFACE_SAMPLER8X8_VA,
                        ARG_KIND_SURFACE_2D_SCOREBOARD,
                        ARG_KIND_STATE_BUFFER ) )
        {
            argIndexStep = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array exists
        }
        else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
        {
            argIndexStep = m_args[orgArgIndex].unitVmeArraySize;
        }

        if(m_args[ orgArgIndex ].isDirty)
        {
            if(m_args[ orgArgIndex ].unitCount > 1)
            { // thread arg is dirty
                CM_ASSERTMESSAGE("Error: Thread arg is not allowed in GPGPU walker.");
                hr = CM_FAILURE; // Thread arg is not allowed in GPGPU walker
                goto finish;
            }

            if ( CHECK_SURFACE_TYPE( m_args[ orgArgIndex ].unitKind,
                        ARG_KIND_SURFACE,
                        ARG_KIND_SURFACE_1D,
                        ARG_KIND_SURFACE_2D,
                        ARG_KIND_SURFACE_2D_UP,
                        ARG_KIND_SURFACE_SAMPLER,
                        ARG_KIND_SURFACE2DUP_SAMPLER,
                        ARG_KIND_SURFACE_3D,
                        ARG_KIND_SURFACE_SAMPLER8X8_AVS,
                        ARG_KIND_SURFACE_SAMPLER8X8_VA,
                        ARG_KIND_SURFACE_2D_SCOREBOARD,
                        ARG_KIND_STATE_BUFFER ) )
            { // for surface args
                uint32_t numSurfaces = m_args[orgArgIndex].unitSize/sizeof(int); // Surface array
                if(m_args[ orgArgIndex ].unitCount == 1) // kernel arg
                {
                    if (numSurfaces > 1 )
                    {
                        // Surface array: one HAL arg slot per element, each holding
                        // a 4-byte surface index taken from the packed value blob.
                        for(uint32_t kk=0; kk< numSurfaces ; kk++)
                        {
                            CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
                            CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
                                          m_args[ orgArgIndex ].value + kk*sizeof(uint32_t),
                                          sizeof(uint32_t));
                            halKernelParam->argParams[argIndex + kk].aliasIndex
                                = m_args[orgArgIndex].aliasIndex;
                            halKernelParam->argParams[argIndex + kk].aliasCreated
                                = m_args[orgArgIndex].aliasCreated;
                            halKernelParam->argParams[argIndex + kk].isNull
                                = m_args[orgArgIndex].isNull;

                            if (!m_args[orgArgIndex].surfIndex[kk])
                            {
                                //if surfIndex is 0, set kind to be CM_ARGUMENT_SURFACE2D
                                //This is for special usage if there is empty element in surface array.
                                halKernelParam->argParams[argIndex + kk].kind = CM_ARGUMENT_SURFACE2D;
                                continue;
                            }
                            halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
                            halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].surfArrayArg[kk].argKindForArray;
                            halKernelParam->argParams[argIndex + kk].nCustomValue = m_args[orgArgIndex].surfArrayArg[kk].addressModeForArray;

                        }
                    }
                    else
                    {
                        // Single surface arg: a null surface writes 0 instead of
                        // copying the (unset) surface index.
                        CM_ASSERT(halKernelParam->argParams[argIndex].firstValue != nullptr);
                        halKernelParam->argParams[argIndex].kind
                            = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;
                        halKernelParam->argParams[argIndex].aliasIndex
                            = m_args[orgArgIndex].aliasIndex;
                        halKernelParam->argParams[argIndex].aliasCreated
                            = m_args[orgArgIndex].aliasCreated;
                        halKernelParam->argParams[argIndex].isNull
                            = m_args[orgArgIndex].isNull;
                        if (halKernelParam->argParams[argIndex].isNull)
                        {
                            *(halKernelParam->argParams[argIndex].firstValue)
                                = 0;
                        }
                        else
                        {
                            CmSafeMemCopy(
                                halKernelParam->argParams[argIndex].firstValue,
                                m_args[orgArgIndex].value, sizeof(uint32_t));
                        }
                    }
                }
            }
            else if (CHECK_SURFACE_TYPE(m_args[orgArgIndex].unitKind, ARG_KIND_SURFACE_VME))
            {
                uint32_t numSurfaces = m_args[orgArgIndex].unitVmeArraySize;
                if (m_args[orgArgIndex].unitCount == 1) // kernel arg
                {
                    // VME arg values are variable-sized records packed back to
                    // back in the value blob; walk them with a running offset.
                    uint32_t vmeSurfOffset = 0;
                    for (uint32_t kk = 0; kk< numSurfaces; kk++)
                    {
                        uint32_t vmeSize = getVmeArgValueSize((PCM_HAL_VME_ARG_VALUE)(m_args[orgArgIndex].value + vmeSurfOffset));

                        // reallocate the firstValue for VME surface every time
                        // since the number of surfaces may vary
                        MosSafeDeleteArray(halKernelParam->argParams[argIndex + kk].firstValue);
                        halKernelParam->argParams[argIndex + kk].firstValue = MOS_NewArray(uint8_t, vmeSize);
                        CM_ASSERT(halKernelParam->argParams[argIndex + kk].firstValue != nullptr);
                        CmSafeMemCopy(halKernelParam->argParams[argIndex + kk].firstValue,
                                      m_args[orgArgIndex].value + vmeSurfOffset, vmeSize);

                        halKernelParam->argParams[argIndex + kk].kind = (CM_HAL_KERNEL_ARG_KIND)m_args[orgArgIndex].unitKind;

                        halKernelParam->argParams[argIndex + kk].aliasIndex = m_args[orgArgIndex].aliasIndex;
                        halKernelParam->argParams[argIndex + kk].aliasCreated = m_args[orgArgIndex].aliasCreated;
                        halKernelParam->argParams[argIndex + kk].isNull = m_args[orgArgIndex].isNull;
                        halKernelParam->argParams[argIndex + kk].unitSize = m_args[orgArgIndex].unitSize;
                        vmeSurfOffset += vmeSize;
                    }
                }
            }
            else
            {
                // Non-surface arg: copy per-kernel data through the common helper.
                CM_CHK_CMSTATUS_GOTOFINISH(CreateThreadArgData(&halKernelParam->argParams[argIndex ], orgArgIndex, nullptr, m_args));
            }
        }
        argIndex += argIndexStep;
    }

    // Refresh sampler binding-table entries, then clear the staged entries so
    // they are rebuilt for the next enqueue.
    if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
    {
        if ( m_samplerBtiCount != 0 )
        {
            CmSafeMemCopy( ( void* )halKernelParam->samplerBTIParam.samplerInfo, ( void* )m_samplerBtiEntry, sizeof( m_samplerBtiEntry ) );
            halKernelParam->samplerBTIParam.samplerCount = m_samplerBtiCount;

            CmSafeMemSet(m_samplerBtiEntry, 0, sizeof(m_samplerBtiEntry));
            m_samplerBtiCount = 0;
        }
    }

    CM_CHK_CMSTATUS_GOTOFINISH(UpdateKernelDataGlobalSurfaceInfo( halKernelParam ));

    CM_CHK_CMSTATUS_GOTOFINISH(CalculateKernelSurfacesNum(surfNum, halKernelParam->numSurfaces));

    // GPGPU walker - implicit args
    uint32_t thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth;
    threadGroupSpace->GetThreadGroupSpaceSize(thrdSpaceWidth, thrdSpaceHeight, thrdSpaceDepth, grpSpaceWidth, grpSpaceHeight, grpSpaceDepth);

    halKernelParam->gpgpuWalkerParams.groupDepth = grpSpaceDepth;
    halKernelParam->gpgpuWalkerParams.groupHeight = grpSpaceHeight;
    halKernelParam->gpgpuWalkerParams.groupWidth = grpSpaceWidth;
    halKernelParam->gpgpuWalkerParams.threadDepth = thrdSpaceDepth;
    halKernelParam->gpgpuWalkerParams.threadWidth = thrdSpaceWidth;
    halKernelParam->gpgpuWalkerParams.threadHeight = thrdSpaceHeight;

    // Pre-3.3 CISA kernels receive the space dimensions as six trailing
    // implicit kernel args appended after the user args.
    if (getVersionAsInt(m_program->m_cisaMajorVersion, m_program->m_cisaMinorVersion) < getVersionAsInt(3, 3))
    {
        CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 0].firstValue, thrdSpaceWidth));
        CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 1].firstValue, thrdSpaceHeight));
        CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 2].firstValue, grpSpaceWidth));
        CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 3].firstValue, grpSpaceHeight));
        CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 4].firstValue, thrdSpaceWidth));
        CM_CHK_CMSTATUS_GOTOFINISH(CreateKernelArgDataGroup (halKernelParam->argParams[argIndex + 5].firstValue, thrdSpaceHeight));
    }

    CM_CHK_CMSTATUS_GOTOFINISH(UpdateSamplerHeap(kernelData));
finish:
    return hr;
}
4857
4858 //*-----------------------------------------------------------------------------
4859 //| Purpose: Create kernel indirect data
4860 //| Returns: Result of the operation.
4861 //*-----------------------------------------------------------------------------
CreateKernelIndirectData(PCM_HAL_INDIRECT_DATA_PARAM halIndirectData)4862 int32_t CmKernelRT::CreateKernelIndirectData(
4863 PCM_HAL_INDIRECT_DATA_PARAM halIndirectData ) // in/out
4864 {
4865 int32_t hr = CM_SUCCESS;
4866
4867 halIndirectData->indirectDataSize = m_usKernelPayloadDataSize;
4868 halIndirectData->surfaceCount = m_usKernelPayloadSurfaceCount;
4869
4870 if( halIndirectData->indirectData == nullptr && m_usKernelPayloadDataSize != 0)
4871 {
4872 halIndirectData->indirectData = MOS_NewArray(uint8_t, halIndirectData->indirectDataSize);
4873 CM_CHK_NULL_GOTOFINISH(halIndirectData->indirectData, CM_OUT_OF_HOST_MEMORY);
4874 }
4875
4876 // For future kernel data, pKbyte is starting point
4877 if( halIndirectData->surfaceInfo == nullptr && m_usKernelPayloadSurfaceCount != 0)
4878 {
4879 halIndirectData->surfaceInfo = MOS_NewArray(CM_INDIRECT_SURFACE_INFO, halIndirectData->surfaceCount);
4880 CM_CHK_NULL_GOTOFINISH(halIndirectData->surfaceInfo, CM_OUT_OF_HOST_MEMORY);
4881 }
4882
4883 if(m_usKernelPayloadDataSize != 0)
4884 {
4885 CmSafeMemCopy(halIndirectData->indirectData, (void *)m_kernelPayloadData, m_usKernelPayloadDataSize);
4886 }
4887
4888 if(m_usKernelPayloadSurfaceCount != 0)
4889 {
4890 CmSafeMemCopy((void*)halIndirectData->surfaceInfo, (void*)m_IndirectSurfaceInfoArray,
4891 m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4892 //clear m_IndirectSurfaceInfoArray every enqueue
4893 CmSafeMemSet(m_IndirectSurfaceInfoArray, 0, m_usKernelPayloadSurfaceCount * sizeof(CM_INDIRECT_SURFACE_INFO));
4894 m_usKernelPayloadSurfaceCount = 0;
4895 }
4896 finish:
4897 if( hr != CM_SUCCESS)
4898 {
4899 if(halIndirectData->indirectData) MosSafeDeleteArray(halIndirectData->indirectData);
4900 if(halIndirectData->surfaceInfo) MosSafeDeleteArray(halIndirectData->surfaceInfo);
4901 }
4902 return hr;
4903 }
4904
4905 //*-----------------------------------------------------------------------------
4906 //| Purpose: UpdateLastKernelData
4907 //| Returns: Result of the operation.
4908 //*-----------------------------------------------------------------------------
UpdateLastKernelData(CmKernelData * & kernelData)4909 int32_t CmKernelRT::UpdateLastKernelData(
4910 CmKernelData* & kernelData) // in
4911 {
4912 int32_t hr = CM_SUCCESS;
4913
4914 if( kernelData == nullptr || m_lastKernelData == kernelData )
4915 {
4916 CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4917 return CM_NULL_POINTER;
4918 }
4919
4920 if(m_lastKernelData)
4921 {
4922 CmKernelData::Destroy(m_lastKernelData); // reduce ref count or delete it
4923 }
4924 CSync* kernelLock = m_device->GetProgramKernelLock();
4925 CLock locker(*kernelLock);
4926 m_lastKernelData = kernelData;
4927 m_lastKernelData->Acquire();
4928 m_lastKernelDataSize = m_lastKernelData->GetKernelDataSize();
4929
4930 return hr;
4931 }
4932
4933 //*-----------------------------------------------------------------------------
4934 //| Purpose: Wrapper of CmKernelData::Destroy.
4935 //| Returns: Result of the operation.
4936 //*-----------------------------------------------------------------------------
ReleaseKernelData(CmKernelData * & kernelData)4937 int32_t CmKernelRT::ReleaseKernelData(
4938 CmKernelData* & kernelData)
4939 {
4940 int32_t hr = CM_SUCCESS;
4941
4942 if( kernelData == nullptr)
4943 {
4944 CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4945 return CM_NULL_POINTER;
4946 }
4947
4948 CSync* kernelLock = m_device->GetProgramKernelLock();
4949 CLock locker(*kernelLock);
4950
4951 if(m_lastKernelData == kernelData)
4952 {
4953 // If the kernel data is the last kernel data
4954 // Need to update m_lastKernelData.
4955 hr = CmKernelData::Destroy(m_lastKernelData);
4956 }
4957 else
4958 {
4959 hr = CmKernelData::Destroy(kernelData);
4960 }
4961
4962 return hr;
4963 }
4964
4965 //*-----------------------------------------------------------------------------
4966 //| Purpose: Acquire Kernel and Program
4967 //*-----------------------------------------------------------------------------
AcquireKernelProgram()4968 int32_t CmKernelRT::AcquireKernelProgram()
4969 {
4970 CSync* kernelLock = m_device->GetProgramKernelLock();
4971 CLock locker(*kernelLock);
4972
4973 this->Acquire(); // increase kernel's ref count
4974 m_program->Acquire(); // increase program's ref count
4975
4976 return CM_SUCCESS;
4977 }
4978
4979 //*-----------------------------------------------------------------------------
//| Purpose:    Acquire KernelData, Kernel and Program
4981 //*-----------------------------------------------------------------------------
AcquireKernelData(CmKernelData * & kernelData)4982 int32_t CmKernelRT::AcquireKernelData(
4983 CmKernelData * &kernelData)
4984 {
4985 int32_t hr = CM_SUCCESS;
4986
4987 if (kernelData == nullptr)
4988 {
4989 CM_ASSERTMESSAGE("Error: Invalid kernel data handle.");
4990 return CM_NULL_POINTER;
4991 }
4992
4993 CSync* kernelLock = m_device->GetProgramKernelLock();
4994 CLock locker(*kernelLock);
4995 kernelData->Acquire(); // increase kernel data's ref count
4996
4997 return hr;
4998 }
4999
SetAsClonedKernel(uint32_t cloneKernelID)5000 void CmKernelRT::SetAsClonedKernel(uint32_t cloneKernelID)
5001 {
5002 m_isClonedKernel = true;
5003 m_cloneKernelID = cloneKernelID;
5004 }
5005
GetCloneKernelID(uint32_t & cloneKernelID)5006 bool CmKernelRT::GetCloneKernelID(uint32_t& cloneKernelID)
5007 {
5008 if (m_isClonedKernel)
5009 {
5010 cloneKernelID = m_cloneKernelID;
5011 return true;
5012 }
5013
5014 return false;
5015 }
5016
SetHasClones()5017 void CmKernelRT::SetHasClones()
5018 {
5019 m_hasClones = true;
5020 }
5021
5022 //*-----------------------------------------------------------------------------
5023 //| Purpose: Clone/copy current kernel
5024 //| Returns: New kernel with content of source kernel
5025 //*-----------------------------------------------------------------------------
CloneKernel(CmKernelRT * & kernelOut,uint32_t id)5026 int32_t CmKernelRT::CloneKernel(CmKernelRT *& kernelOut, uint32_t id)
5027 {
5028 int32_t hr = CM_SUCCESS;
5029
5030 CSync* kernelLock = m_device->GetProgramKernelLock();
5031 CLock locker(*kernelLock);
5032
5033 CmDynamicArray * kernelArray = m_device->GetKernelArray();
5034
5035 uint32_t freeSlotinKernelArray = kernelArray->GetFirstFreeIndex();
5036
5037 hr = Create(m_device, m_program, (char*)GetName(), freeSlotinKernelArray, id, kernelOut, m_options);
5038
5039 if (hr == CM_SUCCESS)
5040 {
5041 kernelOut->SetAsClonedKernel(m_id >> 32);
5042 kernelArray->SetElement(freeSlotinKernelArray, kernelOut);
5043 uint32_t *kernelCount = m_device->GetKernelCount();
5044 *kernelCount = *kernelCount + 1;
5045
5046 SetHasClones();
5047 }
5048
5049 return hr;
5050 }
5051
5052 //*-----------------------------------------------------------------------------
5053 //| Purpose: Set Kernel's index in one task
5054 //| Returns: Result of the operation.
5055 //*-----------------------------------------------------------------------------
SetIndexInTask(uint32_t index)5056 int32_t CmKernelRT::SetIndexInTask(uint32_t index)
5057 {
5058 m_indexInTask = index;
5059 return CM_SUCCESS;
5060 }
5061
5062 //*-----------------------------------------------------------------------------
5063 //| Purpose: Get Kernel's index in one task
5064 //| Returns: Result of the operation.
5065 //*-----------------------------------------------------------------------------
GetIndexInTask(void)5066 uint32_t CmKernelRT::GetIndexInTask(void)
5067 {
5068 return m_indexInTask;
5069 }
5070
5071 //*-----------------------------------------------------------------------------
5072 //| Purpose: Set Associated Flag
5073 //| Returns: Result of the operation.
5074 //*-----------------------------------------------------------------------------
SetAssociatedToTSFlag(bool b)5075 int32_t CmKernelRT::SetAssociatedToTSFlag(bool b)
5076 {
5077 m_threadSpaceAssociated = b;
5078 return CM_SUCCESS;
5079 }
5080
5081 //*-----------------------------------------------------------------------------
5082 //| Purpose: Set threadspace for kernel
5083 //| Returns: Result of the operation.
5084 //| Note: It's exclusive with AssociateThreadGroupSpace()
5085 //*-----------------------------------------------------------------------------
//! \brief    Associates a thread space with this kernel. Mutually exclusive
//!           with AssociateThreadGroupSpace(). On platforms without media
//!           (media-object) mode, the call is forwarded to
//!           AssociateThreadGroupSpace() using the thread space's embedded
//!           group space.
//! \param    threadSpace  [in] thread space to attach; must be non-null.
//! \return   CM_SUCCESS, CM_INVALID_ARG_VALUE, or CM_INVALID_KERNEL_THREADSPACE.
CM_RT_API int32_t CmKernelRT::AssociateThreadSpace(CmThreadSpace *&threadSpace)
{
    if( threadSpace == nullptr )
    {
        CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
        return CM_INVALID_ARG_VALUE;
    }

    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
    if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
    {
        // Media mode unavailable: delegate to the GPGPU-walker path via the
        // group space carried inside the thread space.
        CmThreadSpaceRT *threadSpaceRTConst = static_cast<CmThreadSpaceRT *>(threadSpace);
        if (threadSpaceRTConst == nullptr)
        {
            CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
            return CM_INVALID_ARG_VALUE;
        }
        CmThreadGroupSpace *threadGroupSpace = threadSpaceRTConst->GetThreadGroupSpace();
        return AssociateThreadGroupSpace(threadGroupSpace);
    }
    else
    {
        // A kernel may hold a thread space OR a thread group space, never both.
        if (m_threadGroupSpace != nullptr)
        {
            CM_ASSERTMESSAGE("Error: It's exclusive with AssociateThreadGroupSpace().");
            return CM_INVALID_KERNEL_THREADSPACE;
        }
    }

    // Detect re-association with a different thread space BEFORE overwriting
    // the member; the old/new comparison drives the dirty marking below.
    bool threadSpaceChanged = false;
    if( m_threadSpace )
    {
        if( m_threadSpace != static_cast<CmThreadSpaceRT *>(threadSpace) )
        {
            threadSpaceChanged = true;
        }
    }

    m_threadSpace = static_cast<CmThreadSpaceRT *>(threadSpace);

    uint32_t threadSpaceWidth = 0;
    uint32_t threadSpaceHeight = 0;
    m_threadSpace->GetThreadSpaceSize(threadSpaceWidth, threadSpaceHeight);
    uint32_t threadCount = threadSpaceWidth * threadSpaceHeight;
    if (m_threadCount)
    {
        // Setting threadCount twice with different values will cause reset of kernels
        if (m_threadCount != threadCount)
        {
            m_threadCount = threadCount;
            m_dirty |= CM_KERNEL_DATA_THREAD_COUNT_DIRTY;
        }
    }
    else // first time
    {
        m_threadCount = threadCount;
    }

    if( threadSpaceChanged )
    {
        m_threadSpace->SetDirtyStatus( CM_THREAD_SPACE_DATA_DIRTY);
    }

    return CM_SUCCESS;
}
5151
5152 //*-----------------------------------------------------------------------------
5153 //| Purpose: Set thread group space for kernel
5154 //| Returns: Result of the operation.
5155 //| Note: It's exclusive with AssociateThreadSpace()
5156 //*-----------------------------------------------------------------------------
AssociateThreadGroupSpace(CmThreadGroupSpace * & threadGroupSpace)5157 CM_RT_API int32_t CmKernelRT::AssociateThreadGroupSpace(CmThreadGroupSpace *&threadGroupSpace)
5158 {
5159 if( threadGroupSpace == nullptr )
5160 {
5161 CM_ASSERTMESSAGE("Error: Invalid null pointer.");
5162 return CM_INVALID_ARG_VALUE;
5163 }
5164
5165 if (m_threadSpace != nullptr)
5166 {
5167 CM_ASSERTMESSAGE("Error: It's exclusive with AssociateThreadSpace().");
5168 return CM_INVALID_KERNEL_THREADGROUPSPACE;
5169 }
5170
5171 m_threadGroupSpace = threadGroupSpace;
5172
5173 return CM_SUCCESS;
5174 }
5175
5176 //*-----------------------------------------------------------------------------
5177 //| Purpose: Clear threadspace for kernel
5178 //| Returns: Result of the operation.
5179 //*-----------------------------------------------------------------------------
DeAssociateThreadSpace(CmThreadSpace * & threadSpace)5180 CM_RT_API int32_t CmKernelRT::DeAssociateThreadSpace(CmThreadSpace * &threadSpace)
5181 {
5182 if (threadSpace == nullptr)
5183 {
5184 CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5185 return CM_NULL_POINTER;
5186 }
5187
5188 PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;
5189 if (cmHalState->cmHalInterface->CheckMediaModeAvailability() == false)
5190 {
5191 CmThreadSpaceRT *threadSpaceRTConst = static_cast<CmThreadSpaceRT *>(threadSpace);
5192 if (threadSpaceRTConst == nullptr)
5193 {
5194 CM_ASSERTMESSAGE("Error: Pointer to thread space is null.");
5195 return CM_INVALID_ARG_VALUE;
5196 }
5197
5198 CmThreadGroupSpace *threadGroupSpace = threadSpaceRTConst->GetThreadGroupSpace();
5199 if (m_threadGroupSpace != threadGroupSpace)
5200 {
5201 CM_ASSERTMESSAGE("Error: Invalid thread group space handle.");
5202 return CM_INVALID_ARG_VALUE;
5203 }
5204 m_threadGroupSpace = nullptr;
5205 }
5206 else
5207 {
5208 if (m_threadSpace != static_cast<CmThreadSpaceRT *>(threadSpace))
5209 {
5210 CM_ASSERTMESSAGE("Error: Invalid thread space handle.");
5211 return CM_INVALID_ARG_VALUE;
5212 }
5213 m_threadSpace = nullptr;
5214 }
5215
5216 return CM_SUCCESS;
5217 }
5218 //*--------------------------------------------------------------------------------------------
5219 //| Purpose: query spill memory size, the function can only take effect when jitter is enabled
5220 //| Return: Result of the operation.
5221 //*---------------------------------------------------------------------------------------------
5222
QuerySpillSize(uint32_t & spillMemorySize)5223 CM_RT_API int32_t CmKernelRT::QuerySpillSize(uint32_t &spillMemorySize)
5224 {
5225 CM_KERNEL_INFO *kernelInfo = nullptr;
5226 int32_t kernelStartIndex = m_program->GetKernelStartIndex();
5227
5228 int32_t hr = m_program->GetKernelInfo(m_kernelIndexInProgram, kernelInfo);
5229 if (hr != CM_SUCCESS || kernelInfo == nullptr)
5230 return hr;
5231
5232 if (m_program->IsJitterEnabled()) {
5233 if (kernelInfo->jitInfo != nullptr) {
5234 spillMemorySize = (kernelInfo->jitInfo)->spillMemUsed;
5235 return hr;
5236 }
5237 else
5238 return CM_FAILURE;
5239 }
5240
5241 return CM_FAILURE;
5242 }
5243
5244 //*-----------------------------------------------------------------------------
5245 //| Purpose: Clear threadgroupspace for kernel
5246 //| Returns: Result of the operation.
5247 //*-----------------------------------------------------------------------------
DeAssociateThreadGroupSpace(CmThreadGroupSpace * & threadGroupSpace)5248 int32_t CmKernelRT::DeAssociateThreadGroupSpace(CmThreadGroupSpace * &threadGroupSpace)
5249 {
5250 if (threadGroupSpace == nullptr)
5251 {
5252 CM_ASSERTMESSAGE("Error: Invalid null pointer.");
5253 return CM_NULL_POINTER;
5254 }
5255 if (m_threadGroupSpace != threadGroupSpace)
5256 {
5257 CM_ASSERTMESSAGE("Error: Invalid thread group space handle.");
5258 return CM_INVALID_ARG_VALUE;
5259 }
5260 m_threadGroupSpace = nullptr;
5261 m_dirty = CM_KERNEL_DATA_THREAD_GROUP_SPACE_DIRTY;
5262
5263 return CM_SUCCESS;
5264 }
5265
5266 //*-----------------------------------------------------------------------------
5267 //| Purpose: Indicate whether thread arg existed.
5268 //| Returns: Result of the operation.
5269 //*-----------------------------------------------------------------------------
IsThreadArgExisted()5270 bool CmKernelRT::IsThreadArgExisted()
5271 {
5272 return m_perThreadArgExists;
5273 }
5274
5275 //*-----------------------------------------------------------------------------
5276 //| Purpose: Get the size of SharedLocalMemory
5277 //| Returns: Result of the operation.
5278 //*-----------------------------------------------------------------------------
GetSLMSize()5279 uint32_t CmKernelRT::GetSLMSize()
5280 {
5281 return (uint32_t)m_kernelInfo->kernelSLMSize;
5282 }
5283
5284 //*-----------------------------------------------------------------------------
5285 //| Purpose: Get the spill size of the kernel from JIT
5286 //| Returns: Result of the operation.
5287 //*-----------------------------------------------------------------------------
GetSpillMemUsed()5288 uint32_t CmKernelRT::GetSpillMemUsed()
5289 {
5290 uint32_t spillSize;
5291
5292 if (m_program->IsJitterEnabled() && m_kernelInfo->jitInfo != nullptr)
5293 {
5294 spillSize = (m_kernelInfo->jitInfo)->spillMemUsed;
5295 }
5296 else
5297 {
5298 // kernel uses "--nojitter" option, don't allocate scratch space
5299 spillSize = 0;
5300 }
5301
5302 return spillSize;
5303 }
5304
SearchAvailableIndirectSurfInfoTableEntry(uint16_t kind,uint32_t surfaceIndex,uint32_t bti)5305 int32_t CmKernelRT::SearchAvailableIndirectSurfInfoTableEntry(uint16_t kind, uint32_t surfaceIndex, uint32_t bti)
5306 {
5307 uint16_t i = 0;
5308 for ( i = 0; i < CM_MAX_STATIC_SURFACE_STATES_PER_BT; i++ )
5309 {
5310 if ( ( ( m_IndirectSurfaceInfoArray[ i ].surfaceIndex == surfaceIndex ) && ( m_IndirectSurfaceInfoArray[ i ].kind == kind ) && ( m_IndirectSurfaceInfoArray[ i ].bindingTableIndex == bti ) ) ||
5311 ( ( m_IndirectSurfaceInfoArray[ i ].surfaceIndex == 0 ) && ( m_IndirectSurfaceInfoArray[ i ].kind == 0 ) ) )
5312 {
5313 return i;
5314 }
5315 }
5316 // should never reach this
5317 CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
5318 return CM_FAILURE;
5319 }
5320
5321 //-----------------------------------------------------------------------------------------------------------------
5322 //! Set surface binding table index count for each indirect surface
5323 //! INPUT:
5324 //! 1) Surface format
5325 //! 2) Surface type.
5326 //! OUTPUT:
5327 //! binding table index count
5328 //-----------------------------------------------------------------------------------------------------------------
SetSurfBTINumForIndirectData(CM_SURFACE_FORMAT format,CM_ENUM_CLASS_TYPE surfaceType)5329 int32_t CmKernelRT::SetSurfBTINumForIndirectData(CM_SURFACE_FORMAT format, CM_ENUM_CLASS_TYPE surfaceType)
5330 {
5331 if (surfaceType == CM_ENUM_CLASS_TYPE_CMBUFFER_RT)
5332 {
5333 return 1;
5334 }
5335 else
5336 {
5337 if ((format == CM_SURFACE_FORMAT_NV12) ||
5338 (format == CM_SURFACE_FORMAT_P010) ||
5339 (format == CM_SURFACE_FORMAT_P208) ||
5340 (format == CM_SURFACE_FORMAT_P016))
5341 {
5342 return 2;
5343 }
5344 else if (format == CM_SURFACE_FORMAT_422H ||
5345 format == CM_SURFACE_FORMAT_411P ||
5346 format == CM_SURFACE_FORMAT_IMC3 ||
5347 format == CM_SURFACE_FORMAT_422V ||
5348 format == CM_SURFACE_FORMAT_444P)
5349 { // 3 planes surface
5350 return 3;
5351 }
5352 else
5353 {
5354 return 1;
5355 }
5356 }
5357 // should never reach this
5358 CM_ASSERTMESSAGE("Error: Set surface binding table index count failure.");
5359 return 0;
5360 }
5361
5362 //-----------------------------------------------------------------------------------------------------------------
5363 //! Set surface binding table index by user.
5364 //! If application hope to assign a specific binding table index for a surface, it should call this function.
//! The assigned binding table index should be a valid value for a general surface ( say >=1 and <=242),
5366 //! otherwise, this call will return failure.
5367 //! INPUT:
5368 //! 1) Surface whose binding table index need be set.
//! 2) Assigned binding table index.
5370 //! OUTPUT:
5371 //! CM_SUCCESS
5372 //! CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX if the surface index is not a valid binding table index (valid: 1~242)
5373 //! CM_FAILURE otherwise
5374 //-----------------------------------------------------------------------------------------------------------------
CM_RT_API int32_t CmKernelRT::SetSurfaceBTI(SurfaceIndex* surface, uint32_t btIndex)
{

    uint32_t width, height, bytesPerPixel;
    CM_SURFACE_FORMAT format = CM_SURFACE_FORMAT_INVALID;
    //Sanity check
    if (surface == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Pointer to surface is null.");
        return CM_NULL_POINTER;
    }
    if (!m_surfaceMgr->IsValidSurfaceIndex(btIndex))
    {
        CM_ASSERTMESSAGE("Error: Invalid binding table index.");
        return CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX;
    }

    //Sanity check: reject a BTI already claimed by an earlier payload surface
    uint32_t i = 0;
    for (i = 0; i < m_usKernelPayloadSurfaceCount; i++)
    {
        if (m_IndirectSurfaceInfoArray[i].bindingTableIndex == (uint16_t)btIndex)
        {
            CM_ASSERTMESSAGE("Error: Binding table index has been used once enqueue.");
            return CM_KERNELPAYLOAD_SURFACE_INVALID_BTINDEX;
        }
    }

    uint32_t index = surface->get_data();
    uint32_t handle = 0;

    // Resolve the runtime surface object behind the opaque SurfaceIndex.
    CmSurface* surfaceRT = nullptr;
    m_surfaceMgr->GetSurface( index, surfaceRT );
    if(surfaceRT == nullptr)
    {
        CM_ASSERTMESSAGE("Error: Invalid surface.");
        return CM_NULL_POINTER;
    }

    // Dispatch on the concrete surface class: each branch finds (or reuses) a
    // slot in m_IndirectSurfaceInfoArray, records the arg kind and the HW
    // surface handle, and (for 2D-like surfaces) fetches the format so the
    // per-plane BTI count can be computed at the end.
    CmSurface2DRT* surf2D = nullptr;
    uint32_t indirectSurfInfoEntry = 0;
    if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACE2D )
    {
        surf2D = static_cast< CmSurface2DRT* >( surfaceRT );
        surf2D->GetHandle( handle );
        indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_2D, handle, btIndex);
        if (indirectSurfInfoEntry == CM_FAILURE)
        {
            CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
            return CM_FAILURE;
        }
        m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_2D;
        m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
        surf2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
    }
    else
    {
        CmBuffer_RT* cmBuffer = nullptr;
        if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMBUFFER_RT )
        {
            cmBuffer = static_cast< CmBuffer_RT* >( surfaceRT );
            cmBuffer->GetHandle( handle );
            indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_1D, handle, btIndex);
            if (indirectSurfInfoEntry == CM_FAILURE)
            {
                CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
                return CM_FAILURE;
            }
            m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_1D;
            m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
            // NOTE: buffers keep format == CM_SURFACE_FORMAT_INVALID; the BTI
            // count below is driven by the surface type instead.
        }
        else
        {
            CmSurface2DUPRT* surf2DUP = nullptr;
            if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACE2DUP )
            {
                surf2DUP = static_cast< CmSurface2DUPRT* >( surfaceRT );
                surf2DUP->GetHandle( handle );
                indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_2D_UP, handle, btIndex);
                if (indirectSurfInfoEntry == CM_FAILURE)
                {
                    CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
                    return CM_FAILURE;
                }
                m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_2D_UP;
                m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
                surf2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
            }
            else
            {
                CmSurfaceSampler* surfSampler = nullptr;
                if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER )
                {
                    surfSampler = static_cast< CmSurfaceSampler* >(surfaceRT);

                    // Sampler surfaces are aliases: fetch the underlying
                    // surface to read its descriptor (format, size).
                    uint16_t surfIndexForCurrent = 0;
                    surfSampler->GetCmIndexCurrent(surfIndexForCurrent);
                    CmSurface* surfSampRT= nullptr;
                    m_surfaceMgr->GetSurface(surfIndexForCurrent, surfSampRT);
                    if(surfSampRT == nullptr)
                    {
                        CM_ASSERTMESSAGE("Error: Invalid surface.");
                        return CM_NULL_POINTER;
                    }

                    SAMPLER_SURFACE_TYPE surfaceType;
                    surfSampler->GetSurfaceType(surfaceType);
                    surfSampler->GetHandle( handle );
                    if ( surfaceType == SAMPLER_SURFACE_TYPE_2D )
                    {
                        CmSurface2DRT* surfSamp2D = nullptr;
                        surfSamp2D = static_cast<CmSurface2DRT*>(surfSampRT);
                        surfSamp2D->GetSurfaceDesc(width, height, format, bytesPerPixel);

                        indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER, handle, btIndex);
                        if (indirectSurfInfoEntry == CM_FAILURE)
                        {
                            CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
                            return CM_FAILURE;
                        }
                        m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER;
                    }
                    else if ( surfaceType == SAMPLER_SURFACE_TYPE_2DUP )
                    {
                        CmSurface2DUPRT* surfSamp2DUP = nullptr;
                        surfSamp2DUP = static_cast<CmSurface2DUPRT*>(surfSampRT);
                        surfSamp2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);

                        indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE2DUP_SAMPLER, handle, btIndex);
                        if (indirectSurfInfoEntry == CM_FAILURE)
                        {
                            CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
                            return CM_FAILURE;
                        }
                        m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE2DUP_SAMPLER;
                    }
                    else if ( surfaceType == SAMPLER_SURFACE_TYPE_3D )
                    {
                        indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_3D, handle, btIndex);
                        if (indirectSurfInfoEntry == CM_FAILURE)
                        {
                            CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
                            return CM_FAILURE;
                        }
                        m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_3D;
                    }
                    // NOTE(review): if surfaceType matches none of the cases
                    // above, indirectSurfInfoEntry is still 0 here and slot 0
                    // is overwritten — presumably unreachable; verify.
                    m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
                }
                else
                {
                    CmSurfaceSampler8x8* surfSampler8x8 = nullptr;
                    if ( surfaceRT->Type() == CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8 )
                    {
                        surfSampler8x8 = static_cast< CmSurfaceSampler8x8* >( surfaceRT );
                        surfSampler8x8->GetIndexCurrent( handle );

                        // Resolve the underlying 2D surface to read its format.
                        uint16_t surfIndexForCurrent = 0;
                        surfSampler8x8->GetCmIndex(surfIndexForCurrent);
                        CmSurface* surfSamp8x8RT = nullptr;
                        m_surfaceMgr->GetSurface(surfIndexForCurrent, surfSamp8x8RT);
                        if(surfSamp8x8RT == nullptr)
                        {
                            CM_ASSERTMESSAGE("Error: Invalid surface.");
                            return CM_NULL_POINTER;
                        }

                        CmSurface2DRT* surfSamp8x82D = nullptr;
                        surfSamp8x82D = static_cast<CmSurface2DRT*>(surfSamp8x8RT);
                        surfSamp8x82D->GetSurfaceDesc(width, height, format, bytesPerPixel);

                        if ( surfSampler8x8->GetSampler8x8SurfaceType() == CM_AVS_SURFACE )
                        {
                            indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER8X8_AVS, handle, btIndex);
                            if (indirectSurfInfoEntry == CM_FAILURE)
                            {
                                CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
                                return CM_FAILURE;
                            }
                            m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER8X8_AVS;
                        }
                        else if ( surfSampler8x8->GetSampler8x8SurfaceType() == CM_VA_SURFACE )
                        {
                            indirectSurfInfoEntry = SearchAvailableIndirectSurfInfoTableEntry(ARG_KIND_SURFACE_SAMPLER8X8_VA, handle, btIndex);
                            if (indirectSurfInfoEntry == CM_FAILURE)
                            {
                                CM_ASSERTMESSAGE("Error: Can not get available indirect surface info table entry.");
                                return CM_FAILURE;
                            }
                            m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].kind = ARG_KIND_SURFACE_SAMPLER8X8_VA;
                        }
                        m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].surfaceIndex = (uint16_t)handle;
                    }
                    else
                    {
                        // Unsupported surface class for payload BTI assignment.
                        return CM_FAILURE;
                    }
                }
            }
        }
    }

    // Common tail: record the BTI and how many consecutive entries the
    // surface occupies (one per plane), then remember the SurfaceIndex.
    m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].bindingTableIndex = (uint16_t)btIndex;
    if (SetSurfBTINumForIndirectData(format, surfaceRT->Type())== 0)
    {
        CM_ASSERTMESSAGE("Error: Set surface binding table index count failure.");
        return CM_FAILURE;
    }
    m_IndirectSurfaceInfoArray[indirectSurfInfoEntry].numBTIPerSurf = (uint16_t)SetSurfBTINumForIndirectData(format, surfaceRT->Type());

    //Copy it to surface index array

    m_pKernelPayloadSurfaceArray[indirectSurfInfoEntry] = surface;


    // count is actually one larger than the last used index
    m_usKernelPayloadSurfaceCount = indirectSurfInfoEntry + 1;
    m_dirty |= (CM_KERNEL_DATA_PAYLOAD_DATA_DIRTY | CM_KERNEL_DATA_PAYLOAD_DATA_SIZE_DIRTY);
    return CM_SUCCESS;
}
5596
//! Returns the kernel's index (m_kernelIndex); assigned elsewhere at creation.
uint32_t CmKernelRT::GetKernelIndex()
{
    return m_kernelIndex;
}
GetKernelGenxBinarySize(void)5601 uint32_t CmKernelRT::GetKernelGenxBinarySize(void)
5602 {
5603 if(m_kernelInfo == nullptr)
5604 {
5605 CM_ASSERTMESSAGE("Error: Invalid kernel genx binary size.");
5606 return 0;
5607 }
5608 else
5609 {
5610 return m_kernelInfo->genxBinarySize;
5611 }
5612 }
5613
5614 //-----------------------------------------------------------------------------------------------------------------
5615 //! Map Surface type to Kernel arg Kind.
5616 //! INPUT: Surface type :CM_ENUM_CLASS_TYPE
5617 //! OUTPUT: Kernel arg Kind :CM_ARG_KIND
5618 //-----------------------------------------------------------------------------------------------------------------
SurfTypeToArgKind(CM_ENUM_CLASS_TYPE surfType)5619 CM_ARG_KIND CmKernelRT::SurfTypeToArgKind(CM_ENUM_CLASS_TYPE surfType)
5620 {
5621 switch(surfType)
5622 {
5623 case CM_ENUM_CLASS_TYPE_CMBUFFER_RT :return ARG_KIND_SURFACE_1D;
5624 case CM_ENUM_CLASS_TYPE_CMSURFACE2D :return ARG_KIND_SURFACE_2D;
5625 case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP :return ARG_KIND_SURFACE_2D_UP;
5626 case CM_ENUM_CLASS_TYPE_CMSURFACE3D :return ARG_KIND_SURFACE_3D;
5627 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER :return ARG_KIND_SURFACE_SAMPLER;
5628 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8 :return ARG_KIND_SURFACE_SAMPLER8X8_AVS;
5629 case CM_ENUM_CLASS_TYPE_CMSURFACEVME :return ARG_KIND_SURFACE_VME;
5630 case CM_ENUM_CLASS_TYPE_CMSAMPLER_RT :return ARG_KIND_SAMPLER;
5631 case CM_ENUM_CLASS_TYPE_CMSAMPLER8X8STATE_RT :return ARG_KIND_SAMPLER;
5632 case CM_ENUM_CLASS_TYPE_CM_STATE_BUFFER :return ARG_KIND_STATE_BUFFER;
5633
5634 default:
5635 CM_ASSERTMESSAGE("Error: Invalid surface type.");
5636 break;
5637 }
5638 return ARG_KIND_GENERAL;
5639 }
5640
CalculateKernelSurfacesNum(uint32_t & kernelSurfaceNum,uint32_t & neededBTEntryNum)5641 int32_t CmKernelRT::CalculateKernelSurfacesNum(uint32_t& kernelSurfaceNum, uint32_t& neededBTEntryNum)
5642 {
5643 uint32_t surfaceArraySize = 0;
5644 CmSurface* surf = nullptr;
5645 CmSurface2DRT* surf2D = nullptr;
5646 CmSurface2DUPRT* surf2DUP = nullptr;
5647 uint32_t width, height, bytesPerPixel;
5648 CM_SURFACE_FORMAT format;
5649 uint32_t maxBTIndex = 0;
5650
5651 kernelSurfaceNum = 0;
5652 neededBTEntryNum = 0;
5653
5654 surfaceArraySize = m_surfaceMgr->GetSurfacePoolSize();
5655
5656 //Calculate surface number and needed binding table entries
5657 for (uint32_t surfIndex = 0; surfIndex <= m_maxSurfaceIndexAllocated; surfIndex ++)
5658 {
5659 if (m_surfaceArray[surfIndex%surfaceArraySize])
5660 {
5661 surf = nullptr;
5662 m_surfaceMgr->GetSurface(surfIndex, surf);
5663 if (surf)
5664 {
5665 switch(surf->Type())
5666 {
5667 case CM_ENUM_CLASS_TYPE_CMBUFFER_RT:
5668 case CM_ENUM_CLASS_TYPE_CMSURFACE3D:
5669 kernelSurfaceNum ++;
5670 neededBTEntryNum ++;
5671 break;
5672
5673 case CM_ENUM_CLASS_TYPE_CMSURFACEVME:
5674 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER:
5675 case CM_ENUM_CLASS_TYPE_CMSURFACESAMPLER8X8:
5676 //virtual surface, no need increase count
5677 break;
5678
5679 case CM_ENUM_CLASS_TYPE_CMSURFACE2D:
5680 kernelSurfaceNum++;
5681 surf2D = static_cast<CmSurface2DRT*>(surf);
5682 format = CM_SURFACE_FORMAT_INVALID;
5683 surf2D->GetSurfaceDesc(width, height, format, bytesPerPixel);
5684 if ((format == CM_SURFACE_FORMAT_NV12) ||
5685 (format == CM_SURFACE_FORMAT_P010) ||
5686 (format == CM_SURFACE_FORMAT_P208) ||
5687 (format == CM_SURFACE_FORMAT_P016))
5688 {
5689 neededBTEntryNum += 2;
5690 }
5691 else if (format == CM_SURFACE_FORMAT_422H ||
5692 format == CM_SURFACE_FORMAT_411P ||
5693 format == CM_SURFACE_FORMAT_IMC3 ||
5694 format == CM_SURFACE_FORMAT_422V ||
5695 format == CM_SURFACE_FORMAT_444P)
5696 { // 3 planes surface
5697 neededBTEntryNum += 3;
5698 }
5699 else
5700 {
5701 neededBTEntryNum += 1;
5702 }
5703 break;
5704
5705 case CM_ENUM_CLASS_TYPE_CMSURFACE2DUP:
5706 kernelSurfaceNum++;
5707 surf2DUP = static_cast<CmSurface2DUPRT*>(surf);
5708 format = CM_SURFACE_FORMAT_INVALID;
5709 surf2DUP->GetSurfaceDesc(width, height, format, bytesPerPixel);
5710 if ((format == CM_SURFACE_FORMAT_NV12) ||
5711 (format == CM_SURFACE_FORMAT_P010) ||
5712 (format == CM_SURFACE_FORMAT_P208) ||
5713 (format == CM_SURFACE_FORMAT_P016))
5714 {
5715 neededBTEntryNum += 2;
5716 }
5717 else if (format == CM_SURFACE_FORMAT_422H ||
5718 format == CM_SURFACE_FORMAT_411P ||
5719 format == CM_SURFACE_FORMAT_IMC3 ||
5720 format == CM_SURFACE_FORMAT_422V ||
5721 format == CM_SURFACE_FORMAT_444P)
5722 { // 3 planes surface
5723 neededBTEntryNum += 3;
5724 }
5725 else
5726 {
5727 neededBTEntryNum += 1;
5728 }
5729 break;
5730
5731 default:
5732 break;
5733 }
5734 }
5735 }
5736 }
5737
5738 if ((maxBTIndex + 1) > neededBTEntryNum)
5739 {
5740 neededBTEntryNum = maxBTIndex + 1;
5741 }
5742
5743 //Wordaround: the calculation maybe not accurate if the VME surfaces are existed
5744 neededBTEntryNum += m_vmeSurfaceCount;
5745
5746 return CM_SUCCESS;
5747 }
5748
5749 //*-----------------------------------------------------------------------------
5750 //| Purpose: Get aligned curbe size for different platforms
5751 //| Returns: Result of operation.
5752 //*-----------------------------------------------------------------------------
GetAlignedCurbeSize(uint32_t value)5753 uint32_t CmKernelRT::GetAlignedCurbeSize(uint32_t value)
5754 {
5755 uint32_t curbeAlignedSize = 0;
5756
5757 curbeAlignedSize = MOS_ALIGN_CEIL(value, RENDERHAL_CURBE_BLOCK_ALIGN);
5758 return curbeAlignedSize;
5759 }
5760
5761 #if CM_LOG_ON
Log()5762 std::string CmKernelRT::Log()
5763 {
5764
5765 std::ostringstream oss;
5766
5767 oss << " Kernel Name:" << m_kernelInfo->kernelName << std::endl
5768 << " Kernel Binary Size:" << m_kernelInfo->jitBinarySize
5769 << " Index In Task:" << m_indexInTask
5770 << " Thread Count:" << m_threadCount
5771 << " Curbe Size:" << m_sizeInCurbe
5772 << " Kernel arg Count:" << m_argCount
5773 << std::endl;
5774
5775 // Per Kernel Thread Space Log
5776 if(m_threadSpace)
5777 {
5778 oss << m_threadSpace->Log();
5779 }
5780
5781 // Per Kernel Thread Group Space Log
5782 if(m_threadGroupSpace)
5783 {
5784 oss << m_threadGroupSpace->Log();
5785 }
5786
5787 // Arguments Log
5788 for (uint32_t argIndex= 0; argIndex< m_argCount; argIndex++ )
5789 {
5790 if (m_args[argIndex].value) // filter out the implicit arguments
5791 {
5792 ArgLog(oss, argIndex, m_args[argIndex]);
5793 }
5794 }
5795
5796 return oss.str();
5797 }
5798
//! Appends a description of one kernel argument to the log stream: its
//! metadata, raw bytes in hex, and — for surface-kind args — a log entry
//! for every surface it references.
void CmKernelRT::ArgLog(std::ostringstream &oss, uint32_t index, CM_ARG arg)
{

    oss << "[" << index << "] th Argument"
        << " Type :" << arg.unitKind
        << " Count:" << arg.unitCount
        << " Size:" << arg.unitSize
        << " Surface Kind:" << (int)arg.surfaceKind
        << " OffsetInPayload:" << arg.unitOffsetInPayload
        << " OffsetInPayloadOrig:" << arg.unitOffsetInPayloadOrig << "";

    // Raw argument payload as hex bytes.
    CmLogger::LogDataArrayHex( oss, arg.value, arg.unitSize * arg.unitCount);

    if (CHECK_SURFACE_TYPE(arg.unitKind,
                        ARG_KIND_SURFACE_1D,
                        ARG_KIND_SURFACE_2D,
                        ARG_KIND_SURFACE_2D_UP,
                        ARG_KIND_SURFACE_VME,
                        ARG_KIND_SURFACE_SAMPLER,
                        ARG_KIND_SURFACE_3D,
                        ARG_KIND_SURFACE_SAMPLER8X8_AVS,
                        ARG_KIND_SURFACE_SAMPLER8X8_VA,
                        ARG_KIND_SURFACE2DUP_SAMPLER))
    {
        // Each surface slot is one uint32_t in the arg payload.
        uint16_t numSurfaces = arg.unitSize / sizeof(uint32_t);
        if (arg.unitKind == ARG_KIND_SURFACE_VME)
        {
            // VME args embed CM_HAL_VME_ARG_VALUE headers; subtract their
            // size to count only the surface slots.
            numSurfaces = (arg.unitSize - sizeof(CM_HAL_VME_ARG_VALUE) * arg.unitVmeArraySize) / sizeof(uint32_t) + arg.unitVmeArraySize;
        }
        for (uint16_t i = 0; i < numSurfaces; i++)
        {
            // NOTE(review): reads a 16-bit index from arg.surfIndex + i;
            // element width depends on surfIndex's declared type — verify.
            uint32_t surfaceIndex = *(uint16_t *)(arg.surfIndex + i);

            // Null surfaces are placeholders; skip them.
            if(surfaceIndex == CM_NULL_SURFACE)
                continue;

            CmSurface *surf = nullptr;
            m_surfaceMgr->GetSurface(surfaceIndex, surf);
            if (surf == nullptr)
            {
                continue;
            }
            surf->Log(oss);
        }
    }
}
5845
//! Logging helper: returns the CM HAL state owned by the parent device.
CM_HAL_STATE* CmKernelRT::GetHalState() { return m_device->GetHalState(); }
5847
5848 #endif // #if CM_LOG_ON
5849
SurfaceDump(uint32_t kernelNumber,int32_t taskId)5850 void CmKernelRT::SurfaceDump(uint32_t kernelNumber, int32_t taskId)
5851 {
5852 #if MDF_SURFACE_CONTENT_DUMP
5853 CM_ARG arg;
5854
5855 for (uint32_t argIndex = 0; argIndex< m_argCount; argIndex++)
5856 {
5857 arg = m_args[argIndex];
5858 if (CHECK_SURFACE_TYPE(arg.unitKind,
5859 ARG_KIND_SURFACE_1D,
5860 ARG_KIND_SURFACE_2D,
5861 ARG_KIND_SURFACE_2D_UP,
5862 ARG_KIND_SURFACE_VME,
5863 ARG_KIND_SURFACE_SAMPLER,
5864 ARG_KIND_SURFACE_3D,
5865 ARG_KIND_SURFACE_SAMPLER8X8_AVS,
5866 ARG_KIND_SURFACE_SAMPLER8X8_VA,
5867 ARG_KIND_SURFACE2DUP_SAMPLER))
5868 {
5869 uint16_t numSurfaces = arg.unitSize / sizeof(uint32_t);
5870 if (arg.unitKind == ARG_KIND_SURFACE_VME)
5871 {
5872 numSurfaces = (arg.unitSize - sizeof(CM_HAL_VME_ARG_VALUE) * arg.unitVmeArraySize) / sizeof(uint32_t) + arg.unitVmeArraySize;
5873 }
5874
5875 for (uint16_t i = 0; i < numSurfaces; i++)
5876 {
5877 uint32_t surfaceIndex = *(uint16_t *)(arg.surfIndex + i);
5878 CmSurface *surf = nullptr;
5879 m_surfaceMgr->GetSurface(surfaceIndex, surf);
5880 if (surf == nullptr)
5881 {
5882 return;
5883 }
5884 surf->DumpContent(kernelNumber, m_kernelInfo->kernelName, taskId, argIndex, i);
5885 }
5886 }
5887 }
5888 #endif
5889 }
5890
//! Assigns a user-defined binding table index to a sampler.
//! Returns CM_NULL_POINTER for a null sampler,
//! CM_KERNELPAYLOAD_SAMPLER_INVALID_BTINDEX for an out-of-range index,
//! CM_FAILURE on a BTI conflict or a full table, CM_SUCCESS otherwise.
CM_RT_API int32_t CmKernelRT::SetSamplerBTI(SamplerIndex* sampler, uint32_t nIndex)
{
    if (!sampler)
    {
        return CM_NULL_POINTER;
    }
    if (CM_SAMPLER_MAX_BINDING_INDEX < nIndex)
    {
        return CM_KERNELPAYLOAD_SAMPLER_INVALID_BTINDEX;
    }

    uint32_t samplerIndex = sampler->get_data();
    PCM_HAL_STATE cmHalState = ((PCM_CONTEXT_DATA)m_device->GetAccelData())->cmHalState;

    // Scan existing entries: break early if this exact (sampler, BTI) pair is
    // already recorded; otherwise check each pending entry for conflicts.
    uint32_t i = 0;
    for (i = 0; i < m_samplerBtiCount; i++)
    {
        if ((m_samplerBtiEntry[i].samplerIndex == samplerIndex) && (m_samplerBtiEntry[i].samplerBTI == nIndex))
        {
            break;
        }
        // Conflict checks only apply while sampler-BTI data is dirty
        // (i.e. entries not yet committed to kernel data).
        if (m_dirty & cMKERNELDATASAMPLERBTIDIRTY)
        {
            // A different sampler already claimed this BTI.
            if ((m_samplerBtiEntry[i].samplerIndex != samplerIndex) && (m_samplerBtiEntry[i].samplerBTI == nIndex))
            {
                if (cmHalState->useNewSamplerHeap)
                {
                    SamplerParam sampler1 = {};
                    SamplerParam sampler2 = {};
                    cmHalState->cmHalInterface->GetSamplerParamInfoForSamplerType(&cmHalState->samplerTable[m_samplerBtiEntry[i].samplerIndex], sampler1);
                    cmHalState->cmHalInterface->GetSamplerParamInfoForSamplerType(&cmHalState->samplerTable[samplerIndex], sampler2);

                    if (sampler1.elementType== sampler2.elementType)
                    {
                        // return failure only if the two samplers have the same type, because different type samplers are able to set to the same BTI
                        return CM_FAILURE;
                    }
                }
                else
                {
                    return CM_FAILURE;
                }
            }

            CmSampler8x8State_RT *sampler8x8 = nullptr;
            CmSampler8x8State_RT *tmpSampler8x8 = nullptr;
            m_device->GetSampler8x8(samplerIndex, sampler8x8);
            m_device->GetSampler8x8(m_samplerBtiEntry[i].samplerIndex, tmpSampler8x8);

            // On hardware that requires adjacent sampler indices for AVS,
            // another AVS sampler on a neighbouring BTI is also a conflict.
            if (sampler8x8 && tmpSampler8x8 && (sampler8x8->GetStateType() == CM_SAMPLER8X8_AVS)
                && (tmpSampler8x8->GetStateType() == CM_SAMPLER8X8_AVS) &&
                cmHalState->cmHalInterface->IsAdjacentSamplerIndexRequiredbyHw())
            {
                if ((m_samplerBtiEntry[i].samplerIndex != samplerIndex) &&
                    ((m_samplerBtiEntry[i].samplerBTI == nIndex + 1) || (m_samplerBtiEntry[i].samplerBTI == nIndex - 1)))
                    return CM_FAILURE;
            }
        }
    }

    if (i >= CM_MAX_SAMPLER_TABLE_SIZE)
    {
        CM_ASSERTMESSAGE("Error: Exceed maximum sampler table size.");
        return CM_FAILURE;
    }

    // i == m_samplerBtiCount means no existing entry matched: append a new
    // entry and mark the sampler-BTI data dirty for the next enqueue.
    if (i == m_samplerBtiCount)
    {
        m_samplerBtiEntry[i].samplerIndex = samplerIndex;
        m_samplerBtiEntry[i].samplerBTI = nIndex;

        m_samplerBtiCount = i + 1;

        m_dirty |= cMKERNELDATASAMPLERBTIDIRTY;
    }
    return CM_SUCCESS;
}
5968
GetBinary(std::vector<char> & binary)5969 CMRT_UMD_API int32_t CmKernelRT::GetBinary(std::vector<char>& binary)
5970 {
5971 binary.resize(m_binarySize);
5972
5973 CmSafeMemCopy((void *)&binary[0], (void *)m_binary, m_binarySize);
5974
5975 return CM_SUCCESS;
5976 }
5977
ReplaceBinary(std::vector<char> & binary)5978 CMRT_UMD_API int32_t CmKernelRT::ReplaceBinary(std::vector<char>& binary)
5979 {
5980 uint32_t size = binary.size();
5981
5982 if (size == 0)
5983 {
5984 return CM_INVALID_ARG_VALUE;
5985 }
5986
5987 if(m_binaryOrig == nullptr)
5988 {
5989 //Store the orignal binary once.
5990 m_binaryOrig = m_binary;
5991 m_binarySizeOrig = m_binarySize;
5992 }
5993
5994 m_binary = MOS_NewArray(char, size);
5995 CmSafeMemCopy((void *)m_binary, (void *)&binary[0], size);
5996
5997 m_binarySize = size;
5998
5999 return CM_SUCCESS;
6000 }
6001
ResetBinary()6002 CMRT_UMD_API int32_t CmKernelRT::ResetBinary()
6003 {
6004 if (m_binaryOrig == nullptr)
6005 {
6006 //ReplaceBinary is never called
6007 return CM_SUCCESS;
6008 }
6009 if(m_binary!= m_binaryOrig)
6010 {
6011 MosSafeDeleteArray(m_binary);
6012 }
6013 m_binary = m_binaryOrig;
6014 m_binarySize = m_binarySizeOrig;
6015
6016 return CM_SUCCESS;
6017 }
6018
//! Builds the kernel's sampler heap layout (new-sampler-heap mode only).
//! Two passes over a list kept sorted by element type, then BTI:
//!   1) insert samplers with user-defined BTIs, validating spacing;
//!   2) insert remaining (regular) samplers into the gaps.
//! Returns CM_SUCCESS, or MOS_STATUS_INVALID_PARAMETER on a BTI conflict.
int CmKernelRT::UpdateSamplerHeap(CmKernelData *kernelData)
{
    // Get sampler bti & offset
    PCM_HAL_KERNEL_PARAM cmKernel = nullptr;
    PCM_CONTEXT_DATA cmData = (PCM_CONTEXT_DATA)m_device->GetAccelData();
    PCM_HAL_STATE state = cmData->cmHalState;
    std::list<SamplerParam>::iterator iter;
    unsigned int heapOffset = 0;

    // Legacy sampler heap needs no per-kernel layout.
    if (state->useNewSamplerHeap == false)
    {
        return CM_SUCCESS;
    }

    heapOffset = 0;
    cmKernel = kernelData->GetHalCmKernelData();
    std::list<SamplerParam> *sampler_heap = cmKernel->samplerHeap;

    // First pass, inserts sampler with user-defined BTI to the list. Sorts by element order low to high, then by BTI order low to high.
    for (unsigned int samplerElementType = MHW_Sampler1Element; samplerElementType < MHW_SamplerTotalElements; samplerElementType++)
    {
        for (unsigned int n = 0; n < cmKernel->samplerBTIParam.samplerCount; ++n)
        {
            SamplerParam sampler = {};
            sampler.samplerTableIndex = cmKernel->samplerBTIParam.samplerInfo[n].samplerIndex;

            // Process only samplers of the element type for this outer pass.
            if (state->samplerTable[sampler.samplerTableIndex].ElementType == samplerElementType)
            {
                sampler.bti = cmKernel->samplerBTIParam.samplerInfo[n].samplerBTI;
                sampler.userDefinedBti = true;
                state->cmHalInterface->GetSamplerParamInfoForSamplerType(&state->samplerTable[sampler.samplerTableIndex], sampler);

                // Guarantees each user-defined BTI has a spacing between each other user-defined BTIs larger than the stepping
                for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
                {
                    if (iter->elementType == sampler.elementType)
                    {
                        unsigned int diff = (iter->bti > sampler.bti) ? (iter->bti - sampler.bti) : (sampler.bti - iter->bti);
                        if (diff < sampler.btiStepping)
                        {
                            CM_ASSERTMESSAGE("Sampler BTI setting error. Confliction with other Sampler BTI.\n");
                            return MOS_STATUS_INVALID_PARAMETER;
                        }
                    }
                }

                // Inserts by the order
                for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
                {
                    if (iter->elementType > sampler.elementType)
                    {
                        break;
                    }
                    else if ((iter->elementType == sampler.elementType) && (iter->bti > sampler.bti))
                    {
                        break;
                    }
                }
                // A user-defined BTI pins its heap offset directly.
                sampler.heapOffset = sampler.bti * sampler.btiMultiplier;
                sampler_heap->insert(iter, sampler);
            }
        }
    }

    // Second pass, loops over all kernel/thread args, find regular sampler and insert to sampler heap.
    // Follows the existing sorted order.
    for (unsigned int samplerElementType = MHW_Sampler1Element; samplerElementType < MHW_SamplerTotalElements; samplerElementType++)
    {
        for (unsigned int index = 0; index < cmKernel->numArgs; index++)
        {
            PCM_HAL_KERNEL_ARG_PARAM argParam = &cmKernel->argParams[index];
            if (argParam->isNull)
            {
                continue;
            }

            for (unsigned int threadIndex = 0; threadIndex < argParam->unitCount; threadIndex++)
            {
                if (argParam->kind == CM_ARGUMENT_SAMPLER)
                {
                    // The arg payload holds the sampler table index.
                    unsigned char *arg = argParam->firstValue + (threadIndex * argParam->unitSize);
                    unsigned int samplerTableIndex = *((uint32_t *)arg);

                    SamplerParam sampler = {};
                    sampler.samplerTableIndex = samplerTableIndex;
                    state->cmHalInterface->GetSamplerParamInfoForSamplerType(&state->samplerTable[sampler.samplerTableIndex], sampler);
                    sampler.regularBti = true;

                    if (sampler.elementType != samplerElementType)
                    {
                        continue;
                    }

                    // if the sampler is already in the heap, skip
                    bool isDuplicate = false;
                    for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
                    {
                        if (iter->samplerTableIndex == sampler.samplerTableIndex)
                        {
                            isDuplicate = true;
                            iter->regularBti = true;
                            break;
                        }
                    }
                    if (isDuplicate == true)
                    {
                        continue;
                    }

                    // insert the new sampler to the heap
                    heapOffset = 0;
                    for (iter = sampler_heap->begin(); iter != sampler_heap->end(); ++iter)
                    {
                        if (iter->elementType == sampler.elementType)
                        {
                            // Needs to keep the inserted sampler's correctness, so do not insert before same element regular sampler
                            // Only insert before user-defined BTI
                            if (iter->userDefinedBti == true)
                            {
                                unsigned int curOffset = iter->heapOffset;
                                if (heapOffset > curOffset)
                                {
                                    // Confliction, which means that sampler heap in smaller
                                    // element type has excced the position which is supposed
                                    // to put this user-defined BTI sampler.
                                    // User needs to set the BTI to a larger value.
                                    CM_ASSERTMESSAGE("Sampler BTI setting error. Confliction with other Sampler BTI.\n");
                                    return MOS_STATUS_INVALID_PARAMETER;
                                }
                                else
                                {
                                    // Fits in the gap before this pinned sampler?
                                    if (curOffset - heapOffset >= sampler.btiStepping * sampler.btiMultiplier)
                                    {
                                        break;
                                    }
                                    else
                                    {
                                        heapOffset = curOffset + iter->btiStepping * iter->btiMultiplier;
                                    }
                                }
                            }
                            else
                            {
                                heapOffset += iter->btiStepping * iter->btiMultiplier;
                            }
                        }
                        else if (iter->elementType > sampler.elementType)
                        {
                            break;
                        }
                        else
                        {
                            heapOffset = iter->heapOffset + iter->size;
                            std::list<SamplerParam>::iterator iter_next = std::next(iter, 1);
                            if ((iter_next != sampler_heap->end()) && (iter_next->elementType > iter->elementType))
                            {
                                // Aligns heapOffset to next nearest multiple of sampler size if next sampler is a different element type
                                heapOffset = (heapOffset + iter_next->btiStepping * iter_next->btiMultiplier - 1) / (iter_next->btiStepping * iter_next->btiMultiplier) * (iter_next->btiStepping * iter_next->btiMultiplier);
                            }
                        }
                    }

                    // Guard against division by zero below.
                    if(!sampler.btiMultiplier)
                    {
                        CM_ASSERTMESSAGE("Sampler BTI setting error. Multiplier cannot be zero!\n");
                        return MOS_STATUS_INVALID_PARAMETER;
                    }

                    if (iter == sampler_heap->end())
                    {
                        // Aligns heapOffset to next nearest multiple of sampler size if next sampler is a different element type
                        heapOffset = (heapOffset + sampler.btiStepping * sampler.btiMultiplier - 1) / (sampler.btiStepping * sampler.btiMultiplier) * (sampler.btiStepping * sampler.btiMultiplier);
                    }
                    sampler.heapOffset = heapOffset;
                    // Derive the effective BTI from the chosen heap offset.
                    sampler.bti = sampler.heapOffset / sampler.btiMultiplier;
                    sampler_heap->insert(iter, sampler);
                }
            }
        }
    }

    return CM_SUCCESS;
}
6202 }
6203