// Copyright © 2022 Collabora, Ltd.
// SPDX-License-Identifier: MIT

use compiler::bindings::*;
use nak_bindings::*;
use nv_push_rs::Push as NvPush;
use nvidia_headers::classes::cla0c0::mthd as cla0c0;
use nvidia_headers::classes::clb1c0::mthd as clb1c0;
use nvidia_headers::classes::clb1c0::MAXWELL_COMPUTE_B;
use nvidia_headers::classes::clc3c0::mthd as clc3c0;
use nvidia_headers::classes::clc3c0::VOLTA_COMPUTE_A;
use nvidia_headers::classes::clc6c0::mthd as clc6c0;
use nvidia_headers::classes::clc6c0::AMPERE_COMPUTE_A;

use std::io;
use std::ptr;
use std::ptr::NonNull;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Mutex;

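/// Returns true if `dev` is a PCI device with NVIDIA's vendor ID.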
unsafe fn is_nvidia_device(dev: drmDevicePtr) -> bool {
    match (*dev).bustype as u32 {
        DRM_BUS_PCI => {
            let pci = &*(*dev).deviceinfo.pci;
            pci.vendor_id == (NVIDIA_VENDOR_ID as u16)
        }
        _ => false,
    }
}

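/// Layout of constant buffer 0 as filled out by [`Runner::run_raw`]: the GPU
/// address of the data buffer (split into low/high words), the per-invocation
/// stride, and the total invocation count.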
#[repr(C)]
pub struct CB0 {
    pub data_addr_lo: u32,
    pub data_addr_hi: u32,
    pub data_stride: u32,
    pub invocations: u32,
}

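/// A mapped GPU buffer object.  The buffer is allocated in GART, mapped for
/// CPU access, and bound to a fixed GPU virtual address taken from the
/// runner's `next_addr` allocator.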
struct BO<'a> {
    run: &'a Runner,
    bo: NonNull<nouveau_ws_bo>,
    pub addr: u64,
    pub map: *mut std::os::raw::c_void,
}

impl<'a> BO<'a> {
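    /// Allocates a `size`-byte BO (rounded up to a 4K page), maps it, and
    /// binds it at the next available GPU virtual address.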
    fn new(run: &'a Runner, size: u64) -> io::Result<BO<'a>> {
        let size = size.next_multiple_of(4096);

        let mut map: *mut std::os::raw::c_void = std::ptr::null_mut();
        let bo = unsafe {
            nouveau_ws_bo_new_mapped(
                run.dev.as_ptr(),
                size,
                0, // align
                NOUVEAU_WS_BO_GART,
                NOUVEAU_WS_BO_RDWR,
                ptr::from_mut(&mut map),
            )
        };
        let Some(bo) = NonNull::new(bo) else {
            return Err(io::Error::last_os_error());
        };
        assert!(!map.is_null());

        let addr = run.next_addr.fetch_add(size, Ordering::Relaxed);
        assert!(addr % 4096 == 0);

        unsafe {
            nouveau_ws_bo_bind_vma(
                run.dev.as_ptr(),
                bo.as_ptr(),
                addr,
                size,
                0, // bo_offset
                0, // pte_kind
            );
        }

        Ok(BO { run, bo, addr, map })
    }
}

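// Dropping a BO unbinds its VMA and destroys the underlying nouveau BO.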
impl Drop for BO<'_> {
    fn drop(&mut self) {
        unsafe {
            nouveau_ws_bo_unbind_vma(
                self.run.dev.as_ptr(),
                self.addr,
                self.bo.as_ref().size,
            );
            nouveau_ws_bo_destroy(self.bo.as_ptr());
        }
    }
}

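/// A minimal harness for running compute shaders on real hardware through the
/// nouveau winsys: it owns the device, a compute-engine context, a timeline
/// syncobj used to wait on submissions, and a trivial bump allocator for GPU
/// virtual addresses.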
pub struct Runner {
    dev: NonNull<nouveau_ws_device>,
    ctx: NonNull<nouveau_ws_context>,
    syncobj: u32,
    sync_value: Mutex<u64>,
    next_addr: AtomicU64,
}

impl Runner {
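    /// Opens the DRM device at index `dev_id` (or the first NVIDIA device
    /// found if `None`), creates a compute-engine context, and allocates the
    /// timeline syncobj used to wait for submissions.
    ///
    /// Panics if no suitable device is found or context creation fails.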
    pub fn new(dev_id: Option<usize>) -> Runner {
        unsafe {
            let mut drm_devices: [drmDevicePtr; 16] = std::mem::zeroed();
            let num_drm_devices = drmGetDevices(
                drm_devices.as_mut_ptr(),
                drm_devices.len().try_into().unwrap(),
            );

            assert!(num_drm_devices >= 0, "Failed to enumerate DRM devices");
            let num_drm_devices: usize = num_drm_devices.try_into().unwrap();

            let drm_dev = if let Some(dev_id) = dev_id {
                assert!(dev_id < num_drm_devices, "Unknown device {dev_id}");
                assert!(
                    is_nvidia_device(drm_devices[dev_id]),
                    "Device {dev_id} is not an NVIDIA device",
                );
                drm_devices[dev_id]
            } else {
                *drm_devices
                    .iter()
                    .find(|dev| is_nvidia_device(**dev))
                    .expect("Failed to find an NVIDIA device")
            };

            let dev = nouveau_ws_device_new(drm_dev);
            let dev =
                NonNull::new(dev).expect("Failed to create nouveau device");

            drmFreeDevices(
                drm_devices.as_mut_ptr(),
                num_drm_devices.try_into().unwrap(),
            );

            let mut ctx: *mut nouveau_ws_context = std::ptr::null_mut();
            let err = nouveau_ws_context_create(
                dev.as_ptr(),
                NOUVEAU_WS_ENGINE_COMPUTE,
                &mut ctx,
            );
            assert!(err == 0, "Failed to create nouveau context");
            let ctx = NonNull::new(ctx).unwrap();

            let mut syncobj = 0_u32;
            let err = drmSyncobjCreate(dev.as_ref().fd, 0, &mut syncobj);
            assert!(err == 0, "Failed to create syncobj");

            Runner {
                dev,
                ctx,
                syncobj,
                sync_value: Mutex::new(0),
                next_addr: AtomicU64::new(1 << 16),
            }
        }
    }

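    /// Returns the device information for the underlying nouveau device.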
    pub fn dev_info(&self) -> &nv_device_info {
        unsafe { &self.dev.as_ref().info }
    }

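    /// Submits `len` bytes of push buffer starting at GPU address `addr`,
    /// waits for the submission to complete on the timeline syncobj, then
    /// submits an empty exec so that any channel errors are reported.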
    fn exec(&self, addr: u64, len: u16) -> io::Result<()> {
        let sync_value = unsafe {
            let mut sync_value = self.sync_value.lock().unwrap();
            *sync_value += 1;

            let push = drm_nouveau_exec_push {
                va: addr,
                va_len: len.into(),
                flags: 0,
            };
            let sig = drm_nouveau_sync {
                flags: DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ,
                handle: self.syncobj,
                timeline_value: *sync_value,
            };
            let exec = drm_nouveau_exec {
                channel: self.ctx.as_ref().channel as u32,
                wait_count: 0,
                wait_ptr: 0,
                push_count: 1,
                push_ptr: &push as *const _ as u64,
                sig_count: 1,
                sig_ptr: &sig as *const _ as u64,
            };
            let err = drmIoctl(
                self.dev.as_ref().fd,
                DRM_RS_IOCTL_NOUVEAU_EXEC,
                &exec as *const _ as *mut std::os::raw::c_void,
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }
            *sync_value
        };
        // The close of this unsafe { } drops the lock

        unsafe {
            let err = drmSyncobjTimelineWait(
                self.dev.as_ref().fd,
                &self.syncobj as *const _ as *mut _,
                &sync_value as *const _ as *mut _,
                1,        // num_handles
                i64::MAX, // timeout_nsec
                DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
                std::ptr::null_mut(),
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }

            // Exec again to check for errors
            let mut exec = drm_nouveau_exec {
                channel: self.ctx.as_ref().channel as u32,
                wait_count: 0,
                wait_ptr: 0,
                push_count: 0,
                push_ptr: 0,
                sig_count: 0,
                sig_ptr: 0,
            };
            let err = drmIoctl(
                self.dev.as_ref().fd,
                DRM_RS_IOCTL_NOUVEAU_EXEC,
                ptr::from_mut(&mut exec).cast(),
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }
        }

        Ok(())
    }

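    /// Runs a one-dimensional compute `shader` for `invocations` invocations.
    ///
    /// A single GART BO is built holding the push buffer, the QMD, the shader
    /// code, cb0, and a copy of the caller's `data`; cb0 is filled with the
    /// data buffer's GPU address, `data_stride`, and `invocations`.  The grid
    /// is dispatched as `invocations.div_ceil(local_size)` workgroups, and the
    /// data is copied back to the caller even if execution fails.
    ///
    /// # Safety
    ///
    /// `data` must be valid for reads and writes of `data_size` bytes, and
    /// `shader` must be a valid compute shader binary for this device.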
    pub unsafe fn run_raw(
        &self,
        shader: &nak_shader_bin,
        invocations: u32,
        data_stride: u32,
        data: *mut std::os::raw::c_void,
        data_size: usize,
    ) -> io::Result<()> {
        assert!(shader.info.stage == MESA_SHADER_COMPUTE);
        let cs_info = &shader.info.__bindgen_anon_1.cs;
        assert!(cs_info.local_size[1] == 1 && cs_info.local_size[2] == 1);
        let local_size = cs_info.local_size[0];

        // Compute the needed size of the buffer
        let mut size = 0_usize;

        const MAX_PUSH_DW: usize = 256;
        let push_offset = size;
        size = push_offset + 4 * MAX_PUSH_DW;

        const QMD_SIZE: usize = 64 * 4;
        let qmd_offset = size.next_multiple_of(0x100);
        size = qmd_offset + 4 * QMD_SIZE;

        let shader_offset = size.next_multiple_of(0x80);
        size = shader_offset + usize::try_from(shader.code_size).unwrap();

        let cb0_offset = size.next_multiple_of(256);
        size = cb0_offset + std::mem::size_of::<CB0>();

        let data_offset = size.next_multiple_of(256);
        size = data_offset + data_size;

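        // All of the regions computed above live in a single GART BO:
        // pushbuf, QMD, shader code, cb0, and the caller's data.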
        let bo = BO::new(self, size.try_into().unwrap())?;

        // Copy the data from the caller into our BO
        let data_addr = bo.addr + u64::try_from(data_offset).unwrap();
        let data_map = bo.map.byte_offset(data_offset.try_into().unwrap());
        std::ptr::copy(data, data_map, data_size);

        // Fill out cb0
        let cb0_addr = bo.addr + u64::try_from(cb0_offset).unwrap();
        let cb0_map = bo.map.byte_offset(cb0_offset.try_into().unwrap());
        cb0_map.cast::<CB0>().write(CB0 {
            data_addr_lo: data_addr as u32,
            data_addr_hi: (data_addr >> 32) as u32,
            data_stride,
            invocations,
        });

        // Upload the shader
        let shader_addr = bo.addr + u64::try_from(shader_offset).unwrap();
        let shader_map = bo.map.byte_offset(shader_offset.try_into().unwrap());
        std::ptr::copy(
            shader.code,
            shader_map,
            shader.code_size.try_into().unwrap(),
        );

        // Populate and upload the QMD
        let mut qmd_cbufs: [nak_qmd_cbuf; 8] = unsafe { std::mem::zeroed() };
        qmd_cbufs[0] = nak_qmd_cbuf {
            index: 0,
            size: std::mem::size_of::<CB0>()
                .next_multiple_of(256)
                .try_into()
                .unwrap(),
            addr: cb0_addr,
        };
        let qmd_info = nak_qmd_info {
            // Pre-Volta, we set the program region to the start of the bo
            addr: if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
                shader_offset.try_into().unwrap()
            } else {
                shader_addr
            },
            smem_size: 0,
            smem_max: 48 * 1024,
            global_size: [invocations.div_ceil(local_size.into()), 1, 1],
            num_cbufs: 1,
            cbufs: qmd_cbufs,
        };

        let qmd_addr = bo.addr + u64::try_from(qmd_offset).unwrap();
        let qmd_map = bo.map.byte_offset(qmd_offset.try_into().unwrap());
        nak_fill_qmd(
            self.dev_info(),
            &shader.info,
            &qmd_info,
            qmd_map,
            QMD_SIZE,
        );

        // Fill out the pushbuf
        let mut p = NvPush::new();

        p.push_method(cla0c0::SetObject {
            class_id: self.dev_info().cls_compute.into(),
            engine_id: 0,
        });
        if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
            p.push_method(cla0c0::SetProgramRegionA {
                address_upper: (bo.addr >> 32) as u32,
            });
            p.push_method(cla0c0::SetProgramRegionB {
                address_lower: bo.addr as u32,
            });
        }

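        // Fixed base addresses for the shared and local memory windows.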
        let smem_base_addr = 0xfe000000_u32;
        let lmem_base_addr = 0xff000000_u32;
        if self.dev_info().cls_compute >= VOLTA_COMPUTE_A {
            p.push_method(clc3c0::SetShaderSharedMemoryWindowA {
                base_address_upper: 0,
            });
            p.push_method(clc3c0::SetShaderSharedMemoryWindowB {
                base_address: smem_base_addr,
            });

            p.push_method(clc3c0::SetShaderLocalMemoryWindowA {
                base_address_upper: 0,
            });
            p.push_method(clc3c0::SetShaderLocalMemoryWindowB {
                base_address: lmem_base_addr,
            });
        } else {
            p.push_method(cla0c0::SetShaderSharedMemoryWindow {
                base_address: smem_base_addr,
            });
            p.push_method(cla0c0::SetShaderLocalMemoryWindow {
                base_address: lmem_base_addr,
            });
        }

        if self.dev_info().cls_compute >= MAXWELL_COMPUTE_B {
            p.push_method(clb1c0::InvalidateSkedCaches { v: 0 });
        }

        p.push_method(cla0c0::SendPcasA {
            qmd_address_shifted8: (qmd_addr >> 8) as u32,
        });
        if self.dev_info().cls_compute >= AMPERE_COMPUTE_A {
            p.push_method(clc6c0::SendSignalingPcas2B {
                pcas_action: clc6c0::SendSignalingPcas2BPcasAction::InvalidateCopySchedule,
            });
        } else {
            p.push_method(cla0c0::SendSignalingPcasB {
                invalidate: true,
                schedule: true,
            });
        }

        let push_addr = bo.addr + u64::try_from(push_offset).unwrap();
        let push_map = bo.map.byte_offset(push_offset.try_into().unwrap());
        std::ptr::copy(p.as_ptr(), push_map.cast(), p.len());

        let res = self.exec(push_addr, (p.len() * 4).try_into().unwrap());

        // Always copy the data back to the caller, even if exec fails
        let data_map = bo.map.byte_offset(data_offset.try_into().unwrap());
        std::ptr::copy(data_map, data, data_size);

        res
    }

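    /// Safe wrapper around [`Self::run_raw`]: runs `shader` with one
    /// invocation per element of `data`, using `size_of::<T>()` as the data
    /// stride, and writes the results back into `data`.
    ///
    /// A sketch of the intended usage (not compiled; `shader` stands in for a
    /// `nak_shader_bin` produced elsewhere, e.g. by the NAK compiler):
    ///
    /// ```ignore
    /// let runner = Runner::new(None); // first NVIDIA device found
    /// let mut data = [0u32; 256];     // one element per invocation
    /// runner.run(&shader, &mut data).expect("GPU run failed");
    /// ```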
    pub fn run<T>(
        &self,
        shader: &nak_shader_bin,
        data: &mut [T],
    ) -> io::Result<()> {
        unsafe {
            let stride = std::mem::size_of::<T>();
            self.run_raw(
                shader,
                data.len().try_into().unwrap(),
                stride.try_into().unwrap(),
                data.as_mut_ptr().cast(),
                data.len() * stride,
            )
        }
    }
}

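// Runner is only !Send/!Sync by default because of the raw nouveau_ws
// pointers it holds; the mutable state shared between threads (the timeline
// value) is guarded by the sync_value mutex.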
unsafe impl Sync for Runner {}
unsafe impl Send for Runner {}