xref: /aosp_15_r20/external/mesa3d/src/gallium/frontends/rusticl/api/kernel.rs (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 use crate::api::event::create_and_queue;
2 use crate::api::icd::*;
3 use crate::api::util::*;
4 use crate::core::device::*;
5 use crate::core::event::*;
6 use crate::core::kernel::*;
7 use crate::core::memory::*;
8 use crate::core::program::*;
9 use crate::core::queue::*;
10 
11 use mesa_rust_util::ptr::*;
12 use mesa_rust_util::string::*;
13 use rusticl_opencl_gen::*;
14 use rusticl_proc_macros::cl_entrypoint;
15 use rusticl_proc_macros::cl_info_entrypoint;
16 
17 use std::cmp;
18 use std::mem::{self, MaybeUninit};
19 use std::os::raw::c_void;
20 use std::ptr;
21 use std::slice;
22 use std::sync::Arc;
23 
#[cl_info_entrypoint(clGetKernelInfo)]
impl CLInfo<cl_kernel_info> for cl_kernel {
    /// Serves a single `clGetKernelInfo` query by serializing the requested
    /// property into a byte vector; the entrypoint macro does the raw
    /// `param_value`/`param_value_size` plumbing of the C API.
    fn query(&self, q: cl_kernel_info, _: &[u8]) -> CLResult<Vec<MaybeUninit<u8>>> {
        // Validates the handle and fails with CL_INVALID_KERNEL on a bad one.
        let kernel = Kernel::ref_from_raw(*self)?;
        Ok(match q {
            CL_KERNEL_ATTRIBUTES => cl_prop::<&str>(&kernel.kernel_info.attributes_string),
            CL_KERNEL_CONTEXT => {
                // Hand back the raw handle without touching the refcount;
                // the caller does not own a new reference.
                let ptr = Arc::as_ptr(&kernel.prog.context);
                cl_prop::<cl_context>(cl_context::from_ptr(ptr))
            }
            CL_KERNEL_FUNCTION_NAME => cl_prop::<&str>(&kernel.name),
            CL_KERNEL_NUM_ARGS => cl_prop::<cl_uint>(kernel.kernel_info.args.len() as cl_uint),
            CL_KERNEL_PROGRAM => {
                let ptr = Arc::as_ptr(&kernel.prog);
                cl_prop::<cl_program>(cl_program::from_ptr(ptr))
            }
            CL_KERNEL_REFERENCE_COUNT => cl_prop::<cl_uint>(Kernel::refcnt(*self)?),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => return Err(CL_INVALID_VALUE),
        })
    }
}
46 
#[cl_info_entrypoint(clGetKernelArgInfo)]
impl CLInfoObj<cl_kernel_arg_info, cl_uint> for cl_kernel {
    /// Serves `clGetKernelArgInfo` for the argument at index `idx`.
    fn query(&self, idx: cl_uint, q: cl_kernel_arg_info) -> CLResult<Vec<MaybeUninit<u8>>> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
        if idx as usize >= kernel.kernel_info.args.len() {
            return Err(CL_INVALID_ARG_INDEX);
        }

        // All accessors below are only called with an in-range `idx` thanks to
        // the bounds check above.
        Ok(match *q {
            CL_KERNEL_ARG_ACCESS_QUALIFIER => {
                cl_prop::<cl_kernel_arg_access_qualifier>(kernel.access_qualifier(idx))
            }
            CL_KERNEL_ARG_ADDRESS_QUALIFIER => {
                cl_prop::<cl_kernel_arg_address_qualifier>(kernel.address_qualifier(idx))
            }
            CL_KERNEL_ARG_NAME => cl_prop::<&str>(kernel.arg_name(idx)),
            CL_KERNEL_ARG_TYPE_NAME => cl_prop::<&str>(kernel.arg_type_name(idx)),
            CL_KERNEL_ARG_TYPE_QUALIFIER => {
                cl_prop::<cl_kernel_arg_type_qualifier>(kernel.type_qualifier(idx))
            }
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => return Err(CL_INVALID_VALUE),
        })
    }
}
74 
#[cl_info_entrypoint(clGetKernelWorkGroupInfo)]
impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
    /// Serves `clGetKernelWorkGroupInfo`, resolving a possibly-NULL device
    /// handle against the devices the kernel's program was built for.
    fn query(
        &self,
        dev: cl_device_id,
        q: cl_kernel_work_group_info,
    ) -> CLResult<Vec<MaybeUninit<u8>>> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated with kernel.
        let dev = if dev.is_null() {
            if kernel.prog.devs.len() > 1 {
                return Err(CL_INVALID_DEVICE);
            } else {
                // Single-device program: NULL unambiguously means that device.
                kernel.prog.devs[0]
            }
        } else {
            Device::ref_from_raw(dev)?
        };

        // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
        if !kernel.prog.devs.contains(&dev) {
            return Err(CL_INVALID_DEVICE);
        }

        Ok(match *q {
            CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size()),
            CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(dev)),
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
                cl_prop::<usize>(kernel.preferred_simd_size(dev))
            }
            CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(dev)),
            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(dev)),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => return Err(CL_INVALID_VALUE),
        })
    }
}
113 
114 impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void, usize)>
115     for cl_kernel
116 {
query( &self, (dev, input_value_size, input_value, output_value_size): ( cl_device_id, usize, *const c_void, usize, ), q: cl_program_build_info, ) -> CLResult<Vec<MaybeUninit<u8>>>117     fn query(
118         &self,
119         (dev, input_value_size, input_value, output_value_size): (
120             cl_device_id,
121             usize,
122             *const c_void,
123             usize,
124         ),
125         q: cl_program_build_info,
126     ) -> CLResult<Vec<MaybeUninit<u8>>> {
127         let kernel = Kernel::ref_from_raw(*self)?;
128 
129         // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
130         // with kernel.
131         let dev = if dev.is_null() {
132             if kernel.prog.devs.len() > 1 {
133                 return Err(CL_INVALID_DEVICE);
134             } else {
135                 kernel.prog.devs[0]
136             }
137         } else {
138             Device::ref_from_raw(dev)?
139         };
140 
141         // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
142         if !kernel.prog.devs.contains(&dev) {
143             return Err(CL_INVALID_DEVICE);
144         }
145 
146         // CL_INVALID_OPERATION if device does not support subgroups.
147         if !dev.subgroups_supported() {
148             return Err(CL_INVALID_OPERATION);
149         }
150 
151         let usize_byte = mem::size_of::<usize>();
152         // first we have to convert the input to a proper thing
153         let input: &[usize] = match q {
154             CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE | CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
155                 // CL_INVALID_VALUE if param_name is CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
156                 // CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE or ... and the size in bytes specified by
157                 // input_value_size is not valid or if input_value is NULL.
158                 if ![usize_byte, 2 * usize_byte, 3 * usize_byte].contains(&input_value_size) {
159                     return Err(CL_INVALID_VALUE);
160                 }
161                 // SAFETY: we verified the size as best as possible, with the rest we trust the client
162                 unsafe { slice::from_raw_parts(input_value.cast(), input_value_size / usize_byte) }
163             }
164             CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
165                 // CL_INVALID_VALUE if param_name is ... CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT
166                 // and the size in bytes specified by input_value_size is not valid or if
167                 // input_value is NULL.
168                 if input_value_size != usize_byte || input_value.is_null() {
169                     return Err(CL_INVALID_VALUE);
170                 }
171                 // SAFETY: we trust the client here
172                 unsafe { slice::from_raw_parts(input_value.cast(), 1) }
173             }
174             _ => &[],
175         };
176 
177         Ok(match q {
178             CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
179                 cl_prop::<usize>(kernel.subgroups_for_block(dev, input))
180             }
181             CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE => {
182                 cl_prop::<usize>(kernel.subgroup_size_for_block(dev, input))
183             }
184             CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
185                 let subgroups = input[0];
186                 let mut res = vec![0; 3];
187 
188                 for subgroup_size in kernel.subgroup_sizes(dev) {
189                     let threads = subgroups * subgroup_size;
190 
191                     if threads > dev.max_threads_per_block() {
192                         continue;
193                     }
194 
195                     let block = [threads, 1, 1];
196                     let real_subgroups = kernel.subgroups_for_block(dev, &block);
197 
198                     if real_subgroups == subgroups {
199                         res = block.to_vec();
200                         break;
201                     }
202                 }
203 
204                 res.truncate(output_value_size / usize_byte);
205                 cl_prop::<Vec<usize>>(res)
206             }
207             CL_KERNEL_MAX_NUM_SUB_GROUPS => {
208                 let threads = kernel.max_threads_per_block(dev);
209                 let max_groups = dev.max_subgroups();
210 
211                 let mut result = 0;
212                 for sgs in kernel.subgroup_sizes(dev) {
213                     result = cmp::max(result, threads / sgs);
214                     result = cmp::min(result, max_groups as usize);
215                 }
216                 cl_prop::<usize>(result)
217             }
218             CL_KERNEL_COMPILE_NUM_SUB_GROUPS => cl_prop::<usize>(kernel.num_subgroups()),
219             CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL => cl_prop::<usize>(kernel.subgroup_size()),
220             // CL_INVALID_VALUE if param_name is not one of the supported values
221             _ => return Err(CL_INVALID_VALUE),
222         })
223     }
224 }
225 
226 const ZERO_ARR: [usize; 3] = [0; 3];
227 
228 /// # Safety
229 ///
230 /// This function is only safe when called on an array of `work_dim` length
kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -> &'a [usize]231 unsafe fn kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -> &'a [usize] {
232     if !arr.is_null() {
233         unsafe { slice::from_raw_parts(arr, work_dim as usize) }
234     } else {
235         &ZERO_ARR
236     }
237 }
238 
239 /// # Safety
240 ///
241 /// This function is only safe when called on an array of `work_dim` length
kernel_work_arr_mut<'a>(arr: *mut usize, work_dim: cl_uint) -> Option<&'a mut [usize]>242 unsafe fn kernel_work_arr_mut<'a>(arr: *mut usize, work_dim: cl_uint) -> Option<&'a mut [usize]> {
243     if !arr.is_null() {
244         unsafe { Some(slice::from_raw_parts_mut(arr, work_dim as usize)) }
245     } else {
246         None
247     }
248 }
249 
#[cl_entrypoint(clCreateKernel)]
fn create_kernel(
    program: cl_program,
    kernel_name: *const ::std::os::raw::c_char,
) -> CLResult<cl_kernel> {
    let p = Program::arc_from_raw(program)?;
    // NOTE(review): the name is converted before the NULL check below;
    // presumably c_string_to_string tolerates NULL — confirm in
    // mesa_rust_util::string before reordering.
    let name = c_string_to_string(kernel_name);

    // CL_INVALID_VALUE if kernel_name is NULL.
    if kernel_name.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    let build = p.build_info();
    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built executable for program.
    if build.kernels().is_empty() {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_NAME if kernel_name is not found in program.
    if !build.kernels().contains(&name) {
        return Err(CL_INVALID_KERNEL_NAME);
    }

    // CL_INVALID_KERNEL_DEFINITION if the function definition for __kernel function given by
    // kernel_name such as the number of arguments, the argument types are not the same for all
    // devices for which the program executable has been built.
    if !p.has_unique_kernel_signatures(&name) {
        return Err(CL_INVALID_KERNEL_DEFINITION);
    }

    // Hand a new reference-counted handle to the client.
    Ok(Kernel::new(name, Arc::clone(&p), &build).into_cl())
}
283 
#[cl_entrypoint(clRetainKernel)]
// Increments the kernel's reference count; fails with CL_INVALID_KERNEL on a
// bad handle.
fn retain_kernel(kernel: cl_kernel) -> CLResult<()> {
    Kernel::retain(kernel)
}
288 
#[cl_entrypoint(clReleaseKernel)]
// Decrements the kernel's reference count, freeing it when the count reaches
// zero; fails with CL_INVALID_KERNEL on a bad handle.
fn release_kernel(kernel: cl_kernel) -> CLResult<()> {
    Kernel::release(kernel)
}
293 
#[cl_entrypoint(clCreateKernelsInProgram)]
// Creates a kernel object for every eligible __kernel function in `program`,
// writing the handles into `kernels` (when non-NULL) and the count into
// `num_kernels_ret` (when non-NULL).
fn create_kernels_in_program(
    program: cl_program,
    num_kernels: cl_uint,
    kernels: *mut cl_kernel,
    num_kernels_ret: *mut cl_uint,
) -> CLResult<()> {
    let p = Program::arc_from_raw(program)?;
    let build = p.build_info();

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built executable for any device in
    // program.
    if build.kernels().is_empty() {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_VALUE if kernels is not NULL and num_kernels is less than the number of kernels
    // in program.
    if !kernels.is_null() && build.kernels().len() > num_kernels as usize {
        return Err(CL_INVALID_VALUE);
    }

    // Shadowing the parameter: from here on `num_kernels` counts how many
    // kernel objects were actually created.
    let mut num_kernels = 0;
    for name in build.kernels() {
        // Kernel objects are not created for any __kernel functions in program that do not have the
        // same function definition across all devices for which a program executable has been
        // successfully built.
        if !p.has_unique_kernel_signatures(name) {
            continue;
        }

        if !kernels.is_null() {
            // we just assume the client isn't stupid
            unsafe {
                kernels
                    .add(num_kernels as usize)
                    .write(Kernel::new(name.clone(), p.clone(), &build).into_cl());
            }
        }
        num_kernels += 1;
    }
    // write_checked ignores a NULL num_kernels_ret.
    num_kernels_ret.write_checked(num_kernels);
    Ok(())
}
338 
#[cl_entrypoint(clSetKernelArg)]
// Validates and stores one kernel argument: first the size is checked against
// the argument's declared kind, then the value pointer, and finally the raw
// bytes/handle are converted into a KernelArgValue.
fn set_kernel_arg(
    kernel: cl_kernel,
    arg_index: cl_uint,
    arg_size: usize,
    arg_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let k = Kernel::ref_from_raw(kernel)?;
    let arg_index = arg_index as usize;

    // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
    if let Some(arg) = k.kernel_info.args.get(arg_index) {
        // CL_INVALID_ARG_SIZE if arg_size does not match the size of the data type for an argument
        // that is not a memory object or if the argument is a memory object and
        // arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the
        // local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler).
        match arg.kind {
            // local args carry only a size; it must be non-zero.
            KernelArgType::MemLocal => {
                if arg_size == 0 {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
            // every memory-object kind is passed as a cl_mem handle.
            KernelArgType::MemGlobal
            | KernelArgType::MemConstant
            | KernelArgType::Image
            | KernelArgType::RWImage
            | KernelArgType::Texture => {
                if arg_size != std::mem::size_of::<cl_mem>() {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }

            KernelArgType::Sampler => {
                if arg_size != std::mem::size_of::<cl_sampler>() {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }

            // plain-data args must match their declared byte size exactly.
            KernelArgType::Constant(size) => {
                if size as usize != arg_size {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
        }

        // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
        match arg.kind {
            // If the argument is declared with the local qualifier, the arg_value entry must be
            // NULL.
            KernelArgType::MemLocal => {
                if !arg_value.is_null() {
                    return Err(CL_INVALID_ARG_VALUE);
                }
            }
            // If the argument is of type sampler_t, the arg_value entry must be a pointer to the
            // sampler object.
            KernelArgType::Constant(_) | KernelArgType::Sampler => {
                if arg_value.is_null() {
                    return Err(CL_INVALID_ARG_VALUE);
                }
            }
            // memory objects may legally be NULL (treated as "no buffer" below).
            _ => {}
        };

        // let's create the arg now
        let arg = unsafe {
            if arg.dead {
                // argument was optimized away by the compiler; accept and drop
                // the value.
                KernelArgValue::None
            } else {
                match arg.kind {
                    // copy arg_size raw bytes; non-NULL was verified above.
                    KernelArgType::Constant(_) => KernelArgValue::Constant(
                        slice::from_raw_parts(arg_value.cast(), arg_size).to_vec(),
                    ),
                    KernelArgType::MemConstant | KernelArgType::MemGlobal => {
                        let ptr: *const cl_mem = arg_value.cast();
                        // NULL pointer or NULL handle both mean "no buffer bound".
                        if ptr.is_null() || (*ptr).is_null() {
                            KernelArgValue::None
                        } else {
                            KernelArgValue::Buffer(Buffer::arc_from_raw(*ptr)?)
                        }
                    }
                    KernelArgType::MemLocal => KernelArgValue::LocalMem(arg_size),
                    KernelArgType::Image | KernelArgType::RWImage | KernelArgType::Texture => {
                        // NOTE(review): `img` is dereferenced without a NULL
                        // check here, unlike the buffer path above — confirm
                        // images may not legally be NULL.
                        let img: *const cl_mem = arg_value.cast();
                        KernelArgValue::Image(Image::arc_from_raw(*img)?)
                    }
                    KernelArgType::Sampler => {
                        let ptr: *const cl_sampler = arg_value.cast();
                        KernelArgValue::Sampler(Sampler::arc_from_raw(*ptr)?)
                    }
                }
            }
        };
        k.set_kernel_arg(arg_index, arg)
    } else {
        Err(CL_INVALID_ARG_INDEX)
    }

    //• CL_INVALID_DEVICE_QUEUE for an argument declared to be of type queue_t when the specified arg_value is not a valid device queue object. This error code is missing before version 2.0.
    //• CL_INVALID_ARG_VALUE if the argument is an image declared with the read_only qualifier and arg_value refers to an image object created with cl_mem_flags of CL_MEM_WRITE_ONLY or if the image argument is declared with the write_only qualifier and arg_value refers to an image object created with cl_mem_flags of CL_MEM_READ_ONLY.
    //• CL_MAX_SIZE_RESTRICTION_EXCEEDED if the size in bytes of the memory object (if the argument is a memory object) or arg_size (if the argument is declared with local qualifier) exceeds a language- specified maximum size restriction for this argument, such as the MaxByteOffset SPIR-V decoration. This error code is missing before version 2.2.
}
441 
442 #[cl_entrypoint(clSetKernelArgSVMPointer)]
set_kernel_arg_svm_pointer( kernel: cl_kernel, arg_index: cl_uint, arg_value: *const ::std::os::raw::c_void, ) -> CLResult<()>443 fn set_kernel_arg_svm_pointer(
444     kernel: cl_kernel,
445     arg_index: cl_uint,
446     arg_value: *const ::std::os::raw::c_void,
447 ) -> CLResult<()> {
448     let kernel = Kernel::ref_from_raw(kernel)?;
449     let arg_index = arg_index as usize;
450     let arg_value = arg_value as usize;
451 
452     if !kernel.has_svm_devs() {
453         return Err(CL_INVALID_OPERATION);
454     }
455 
456     if let Some(arg) = kernel.kernel_info.args.get(arg_index) {
457         if !matches!(
458             arg.kind,
459             KernelArgType::MemConstant | KernelArgType::MemGlobal
460         ) {
461             return Err(CL_INVALID_ARG_INDEX);
462         }
463 
464         let arg_value = KernelArgValue::Constant(arg_value.to_ne_bytes().to_vec());
465         kernel.set_kernel_arg(arg_index, arg_value)
466     } else {
467         Err(CL_INVALID_ARG_INDEX)
468     }
469 
470     // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
471 }
472 
473 #[cl_entrypoint(clSetKernelExecInfo)]
set_kernel_exec_info( kernel: cl_kernel, param_name: cl_kernel_exec_info, param_value_size: usize, param_value: *const ::std::os::raw::c_void, ) -> CLResult<()>474 fn set_kernel_exec_info(
475     kernel: cl_kernel,
476     param_name: cl_kernel_exec_info,
477     param_value_size: usize,
478     param_value: *const ::std::os::raw::c_void,
479 ) -> CLResult<()> {
480     let k = Kernel::ref_from_raw(kernel)?;
481 
482     // CL_INVALID_OPERATION if no devices in the context associated with kernel support SVM.
483     if !k.prog.devs.iter().any(|dev| dev.svm_supported()) {
484         return Err(CL_INVALID_OPERATION);
485     }
486 
487     // CL_INVALID_VALUE ... if param_value is NULL
488     if param_value.is_null() {
489         return Err(CL_INVALID_VALUE);
490     }
491 
492     // CL_INVALID_VALUE ... if the size specified by param_value_size is not valid.
493     match param_name {
494         CL_KERNEL_EXEC_INFO_SVM_PTRS | CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM => {
495             // it's a list of pointers
496             if param_value_size % mem::size_of::<*const c_void>() != 0 {
497                 return Err(CL_INVALID_VALUE);
498             }
499         }
500         CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM
501         | CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM => {
502             if param_value_size != mem::size_of::<cl_bool>() {
503                 return Err(CL_INVALID_VALUE);
504             }
505         }
506         // CL_INVALID_VALUE if param_name is not valid
507         _ => return Err(CL_INVALID_VALUE),
508     }
509 
510     Ok(())
511 
512     // CL_INVALID_OPERATION if param_name is CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM and param_value is CL_TRUE but no devices in context associated with kernel support fine-grain system SVM allocations.
513 }
514 
515 #[cl_entrypoint(clEnqueueNDRangeKernel)]
enqueue_ndrange_kernel( command_queue: cl_command_queue, kernel: cl_kernel, work_dim: cl_uint, global_work_offset: *const usize, global_work_size: *const usize, local_work_size: *const usize, num_events_in_wait_list: cl_uint, event_wait_list: *const cl_event, event: *mut cl_event, ) -> CLResult<()>516 fn enqueue_ndrange_kernel(
517     command_queue: cl_command_queue,
518     kernel: cl_kernel,
519     work_dim: cl_uint,
520     global_work_offset: *const usize,
521     global_work_size: *const usize,
522     local_work_size: *const usize,
523     num_events_in_wait_list: cl_uint,
524     event_wait_list: *const cl_event,
525     event: *mut cl_event,
526 ) -> CLResult<()> {
527     let q = Queue::arc_from_raw(command_queue)?;
528     let k = Kernel::arc_from_raw(kernel)?;
529     let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;
530 
531     // CL_INVALID_CONTEXT if context associated with command_queue and kernel are not the same
532     if q.context != k.prog.context {
533         return Err(CL_INVALID_CONTEXT);
534     }
535 
536     // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
537     // for device associated with command_queue.
538     if k.prog.status(q.device) != CL_BUILD_SUCCESS as cl_build_status {
539         return Err(CL_INVALID_PROGRAM_EXECUTABLE);
540     }
541 
542     // CL_INVALID_KERNEL_ARGS if the kernel argument values have not been specified.
543     if k.arg_values().iter().any(|v| v.is_none()) {
544         return Err(CL_INVALID_KERNEL_ARGS);
545     }
546 
547     // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
548     // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
549     if work_dim == 0 || work_dim > q.device.max_grid_dimensions() {
550         return Err(CL_INVALID_WORK_DIMENSION);
551     }
552 
553     // we assume the application gets it right and doesn't pass shorter arrays then actually needed.
554     let global_work_size = unsafe { kernel_work_arr_or_default(global_work_size, work_dim) };
555     let local_work_size = unsafe { kernel_work_arr_or_default(local_work_size, work_dim) };
556     let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };
557 
558     let device_bits = q.device.address_bits();
559     let device_max = u64::MAX >> (u64::BITS - device_bits);
560 
561     let mut threads = 0;
562     for i in 0..work_dim as usize {
563         let lws = local_work_size[i];
564         let gws = global_work_size[i];
565         let gwo = global_work_offset[i];
566 
567         threads *= lws;
568 
569         // CL_INVALID_WORK_ITEM_SIZE if the number of work-items specified in any of
570         // local_work_size[0], … local_work_size[work_dim - 1] is greater than the corresponding
571         // values specified by
572         // CL_DEVICE_MAX_WORK_ITEM_SIZES[0], …, CL_DEVICE_MAX_WORK_ITEM_SIZES[work_dim - 1].
573         if lws > q.device.max_block_sizes()[i] {
574             return Err(CL_INVALID_WORK_ITEM_SIZE);
575         }
576 
577         // CL_INVALID_WORK_GROUP_SIZE if the work-group size must be uniform and the
578         // local_work_size is not NULL, [...] if the global_work_size is not evenly divisible by
579         // the local_work_size.
580         if lws != 0 && gws % lws != 0 {
581             return Err(CL_INVALID_WORK_GROUP_SIZE);
582         }
583 
584         // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not match the
585         // required work-group size for kernel in the program source.
586         if lws != 0 && k.work_group_size()[i] != 0 && lws != k.work_group_size()[i] {
587             return Err(CL_INVALID_WORK_GROUP_SIZE);
588         }
589 
590         // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size[0], …
591         // global_work_size[work_dim - 1] exceed the maximum value representable by size_t on
592         // the device on which the kernel-instance will be enqueued.
593         if gws as u64 > device_max {
594             return Err(CL_INVALID_GLOBAL_WORK_SIZE);
595         }
596 
597         // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size + the
598         // corresponding values in global_work_offset for any dimensions is greater than the
599         // maximum value representable by size t on the device on which the kernel-instance
600         // will be enqueued
601         if u64::checked_add(gws as u64, gwo as u64)
602             .filter(|&x| x <= device_max)
603             .is_none()
604         {
605             return Err(CL_INVALID_GLOBAL_OFFSET);
606         }
607     }
608 
609     // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the total number of work-items
610     // in the work-group computed as local_work_size[0] × … local_work_size[work_dim - 1] is greater
611     // than the value specified by CL_KERNEL_WORK_GROUP_SIZE in the Kernel Object Device Queries
612     // table.
613     if threads != 0 && threads > k.max_threads_per_block(q.device) {
614         return Err(CL_INVALID_WORK_GROUP_SIZE);
615     }
616 
617     // If global_work_size is NULL, or the value in any passed dimension is 0 then the kernel
618     // command will trivially succeed after its event dependencies are satisfied and subsequently
619     // update its completion event.
620     let cb: EventSig = if global_work_size.contains(&0) {
621         Box::new(|_, _| Ok(()))
622     } else {
623         k.launch(
624             &q,
625             work_dim,
626             local_work_size,
627             global_work_size,
628             global_work_offset,
629         )?
630     };
631 
632     create_and_queue(q, CL_COMMAND_NDRANGE_KERNEL, evs, event, false, cb)
633 
634     //• CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not consistent with the required number of sub-groups for kernel in the program source.
635     //• CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is specified as the value for an argument that is a buffer object and the offset specified when the sub-buffer object is created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for device associated with queue. This error code
636     //• CL_INVALID_IMAGE_SIZE if an image object is specified as an argument value and the image dimensions (image width, height, specified or compute row and/or slice pitch) are not supported by device associated with queue.
637     //• CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is specified as an argument value and the image format (image channel order and data type) is not supported by device associated with queue.
638     //• CL_OUT_OF_RESOURCES if there is a failure to queue the execution instance of kernel on the command-queue because of insufficient resources needed to execute the kernel. For example, the explicitly specified local_work_size causes a failure to execute the kernel because of insufficient resources such as registers or local memory. Another example would be the number of read-only image args used in kernel exceed the CL_DEVICE_MAX_READ_IMAGE_ARGS value for device or the number of write-only and read-write image args used in kernel exceed the CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS value for device or the number of samplers used in kernel exceed CL_DEVICE_MAX_SAMPLERS for device.
639     //• CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate memory for data store associated with image or buffer objects specified as arguments to kernel.
640     //• CL_INVALID_OPERATION if SVM pointers are passed as arguments to a kernel and the device does not support SVM or if system pointers are passed as arguments to a kernel and/or stored inside SVM allocations passed as kernel arguments and the device does not support fine grain system SVM allocations.
641 }
642 
643 #[cl_entrypoint(clEnqueueTask)]
enqueue_task( command_queue: cl_command_queue, kernel: cl_kernel, num_events_in_wait_list: cl_uint, event_wait_list: *const cl_event, event: *mut cl_event, ) -> CLResult<()>644 fn enqueue_task(
645     command_queue: cl_command_queue,
646     kernel: cl_kernel,
647     num_events_in_wait_list: cl_uint,
648     event_wait_list: *const cl_event,
649     event: *mut cl_event,
650 ) -> CLResult<()> {
651     // clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with work_dim set to 1,
652     // global_work_offset set to NULL, global_work_size[0] set to 1, and local_work_size[0] set to
653     // 1.
654     enqueue_ndrange_kernel(
655         command_queue,
656         kernel,
657         1,
658         ptr::null(),
659         [1, 1, 1].as_ptr(),
660         [1, 0, 0].as_ptr(),
661         num_events_in_wait_list,
662         event_wait_list,
663         event,
664     )
665 }
666 
667 #[cl_entrypoint(clCloneKernel)]
clone_kernel(source_kernel: cl_kernel) -> CLResult<cl_kernel>668 fn clone_kernel(source_kernel: cl_kernel) -> CLResult<cl_kernel> {
669     let k = Kernel::ref_from_raw(source_kernel)?;
670     Ok(Arc::new(k.clone()).into_cl())
671 }
672 
#[cl_entrypoint(clGetKernelSuggestedLocalWorkSizeKHR)]
// Implements cl_khr_suggested_local_work_size: validates the queue/kernel/work
// parameters and asks the kernel for a recommended local work-group size for
// the given global range, writing it into suggested_local_work_size.
//
// NOTE: the order of the checks below determines which error code the caller
// observes when multiple inputs are invalid — keep it stable.
fn get_kernel_suggested_local_work_size_khr(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    work_dim: cl_uint,
    global_work_offset: *const usize,
    global_work_size: *const usize,
    suggested_local_work_size: *mut usize,
) -> CLResult<()> {
    // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified in
    // global_work_size are 0.
    if global_work_size.is_null() {
        return Err(CL_INVALID_GLOBAL_WORK_SIZE);
    }

    // NOTE(review): clEnqueueNDRangeKernel treats a NULL global_work_offset as
    // all-zero offsets; rejecting NULL here assumes the KHR entry point requires
    // a non-NULL pointer — confirm against the cl_khr_suggested_local_work_size
    // extension spec.
    if global_work_offset.is_null() {
        return Err(CL_INVALID_GLOBAL_OFFSET);
    }

    // CL_INVALID_VALUE if suggested_local_work_size is NULL.
    if suggested_local_work_size.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    // CL_INVALID_COMMAND_QUEUE if command_queue is not a valid host command-queue.
    let queue = Queue::ref_from_raw(command_queue)?;

    // CL_INVALID_KERNEL if kernel is not a valid kernel object.
    let kernel = Kernel::ref_from_raw(kernel)?;

    // CL_INVALID_CONTEXT if the context associated with kernel is not the same as the context
    // associated with command_queue.
    if queue.context != kernel.prog.context {
        return Err(CL_INVALID_CONTEXT);
    }

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
    // for kernel for the device associated with command_queue.
    if kernel.prog.status(queue.device) != CL_BUILD_SUCCESS as cl_build_status {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_ARGS if all argument values for kernel have not been set.
    // An unset argument shows up as None in the kernel's argument-value list.
    if kernel.arg_values().iter().any(|v| v.is_none()) {
        return Err(CL_INVALID_KERNEL_ARGS);
    }

    // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
    // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
    if work_dim == 0 || work_dim > queue.device.max_grid_dimensions() {
        return Err(CL_INVALID_WORK_DIMENSION);
    }

    // SAFETY-relevant: the pointers were null-checked above and work_dim was
    // validated, which is what the helpers rely on to build work_dim-length
    // views of the caller's arrays (presumably substituting defaults for the
    // unspecified dimensions — helper is defined elsewhere in this file).
    // Copied into a Vec because suggest_local_size below takes it mutably.
    let mut global_work_size =
        unsafe { kernel_work_arr_or_default(global_work_size, work_dim).to_vec() };

    let suggested_local_work_size = unsafe {
        kernel_work_arr_mut(suggested_local_work_size, work_dim).ok_or(CL_INVALID_VALUE)?
    };

    let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };

    // Largest value representable by the device's size_t: a mask of the low
    // address_bits bits. address_bits <= 64 or the shift below would panic.
    let device_bits = queue.device.address_bits();
    let device_max = u64::MAX >> (u64::BITS - device_bits);
    for i in 0..work_dim as usize {
        let gws = global_work_size[i];
        let gwo = global_work_offset[i];

        // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified
        // in global_work_size are 0.
        if gws == 0 {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }
        // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size exceed the
        // maximum value representable by size_t on the device associated with command_queue.
        if gws as u64 > device_max {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }
        // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size plus the
        // corresponding value in global_work_offset for dimension exceeds the maximum value
        // representable by size_t on the device associated with command_queue.
        // checked_add guards u64 overflow; the filter enforces the device limit.
        if u64::checked_add(gws as u64, gwo as u64)
            .filter(|&x| x <= device_max)
            .is_none()
        {
            return Err(CL_INVALID_GLOBAL_OFFSET);
        }
    }

    // All inputs validated: ask the kernel for its preferred local size and
    // write it into the caller-provided buffer. global_work_size is passed
    // mutably, so the helper may adjust it internally.
    kernel.suggest_local_size(
        queue.device,
        work_dim as usize,
        &mut global_work_size,
        suggested_local_work_size,
    );

    Ok(())

    // Error conditions from the spec that are not (yet) checked here:
    // CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is set as an argument to kernel and the offset specified when the sub-buffer object was created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN for the device associated with command_queue.
    // CL_INVALID_IMAGE_SIZE if an image object is set as an argument to kernel and the image dimensions are not supported by device associated with command_queue.
    // CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is set as an argument to kernel and the image format is not supported by the device associated with command_queue.
    // CL_INVALID_OPERATION if an SVM pointer is set as an argument to kernel and the device associated with command_queue does not support SVM or the required SVM capabilities for the SVM pointer.
    // CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the device.
    // CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
}
778