// Copyright © 2022 Collabora, Ltd.
// SPDX-License-Identifier: MIT

use compiler::bindings::*;
use nak_bindings::*;
use nv_push_rs::Push as NvPush;
use nvidia_headers::classes::cla0c0::mthd as cla0c0;
use nvidia_headers::classes::clb1c0::mthd as clb1c0;
use nvidia_headers::classes::clb1c0::MAXWELL_COMPUTE_B;
use nvidia_headers::classes::clc3c0::mthd as clc3c0;
use nvidia_headers::classes::clc3c0::VOLTA_COMPUTE_A;
use nvidia_headers::classes::clc6c0::mthd as clc6c0;
use nvidia_headers::classes::clc6c0::AMPERE_COMPUTE_A;

use std::io;
use std::ptr;
use std::ptr::NonNull;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Mutex;

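/// Returns true if `dev` is a PCI device with NVIDIA's vendor ID.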
unsafe fn is_nvidia_device(dev: drmDevicePtr) -> bool {
    match (*dev).bustype as u32 {
        DRM_BUS_PCI => {
            let pci = &*(*dev).deviceinfo.pci;
            pci.vendor_id == (NVIDIA_VENDOR_ID as u16)
        }
        _ => false,
    }
}

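/// Layout of constant buffer 0 as passed to the shader: the 64-bit address
/// of the data buffer (split into low and high halves), the per-invocation
/// stride, and the total number of invocations.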
#[repr(C)]
pub struct CB0 {
    pub data_addr_lo: u32,
    pub data_addr_hi: u32,
    pub data_stride: u32,
    pub invocations: u32,
}

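// CB0 is written into the dispatch buffer and bound as constant buffer 0 in
// the QMD (see run_raw() below).  A small sanity check of its layout:
const _: () = assert!(std::mem::size_of::<CB0>() == 16);

/// A GART buffer object, mapped for CPU access and bound at a fixed GPU
/// virtual address handed out by the runner's bump allocator.  Address
/// ranges are never recycled; the allocator only moves forward.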
struct BO<'a> {
    run: &'a Runner,
    bo: NonNull<nouveau_ws_bo>,
    pub addr: u64,
    pub map: *mut std::os::raw::c_void,
}

impl<'a> BO<'a> {
    fn new(run: &'a Runner, size: u64) -> io::Result<BO<'a>> {
        let size = size.next_multiple_of(4096);

        let mut map: *mut std::os::raw::c_void = std::ptr::null_mut();
        let bo = unsafe {
            nouveau_ws_bo_new_mapped(
                run.dev.as_ptr(),
                size,
                0, // align
                NOUVEAU_WS_BO_GART,
                NOUVEAU_WS_BO_RDWR,
                ptr::from_mut(&mut map),
            )
        };
        let Some(bo) = NonNull::new(bo) else {
            return Err(io::Error::last_os_error());
        };
        assert!(!map.is_null());

        // Grab a fresh GPU virtual address range from the runner's bump
        // allocator and bind the BO there.
        let addr = run.next_addr.fetch_add(size, Ordering::Relaxed);
        assert!(addr % 4096 == 0);

        unsafe {
            nouveau_ws_bo_bind_vma(
                run.dev.as_ptr(),
                bo.as_ptr(),
                addr,
                size,
                0, // bo_offset
                0, // pte_kind
            );
        }

        Ok(BO { run, bo, addr, map })
    }
}

impl Drop for BO<'_> {
    fn drop(&mut self) {
        unsafe {
            nouveau_ws_bo_unbind_vma(
                self.run.dev.as_ptr(),
                self.addr,
                self.bo.as_ref().size,
            );
            nouveau_ws_bo_destroy(self.bo.as_ptr());
        }
    }
}

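/// A compute-only runner on top of a nouveau device.  It owns the device, a
/// compute context, a timeline syncobj used to wait on submissions, and a
/// trivial bump allocator for GPU virtual addresses.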
pub struct Runner {
    dev: NonNull<nouveau_ws_device>,
    ctx: NonNull<nouveau_ws_context>,
    syncobj: u32,
    sync_value: Mutex<u64>,
    next_addr: AtomicU64,
}

impl<'a> Runner {
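    /// Opens the `dev_id`-th DRM device, or the first NVIDIA device found if
    /// `dev_id` is `None`, then creates a nouveau compute context and a
    /// timeline syncobj on it.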
    pub fn new(dev_id: Option<usize>) -> Runner {
        unsafe {
            // Enumerate the DRM devices and pick the NVIDIA one we want.
            let mut drm_devices: [drmDevicePtr; 16] = std::mem::zeroed();
            let num_drm_devices = drmGetDevices(
                drm_devices.as_mut_ptr(),
                drm_devices.len().try_into().unwrap(),
            );

            assert!(num_drm_devices >= 0, "Failed to enumerate DRM devices");
            let num_drm_devices: usize = num_drm_devices.try_into().unwrap();

            let drm_dev = if let Some(dev_id) = dev_id {
                assert!(dev_id < num_drm_devices, "Unknown device {dev_id}");
                assert!(
                    is_nvidia_device(drm_devices[dev_id]),
                    "Device {dev_id} is not an NVIDIA device",
                );
                drm_devices[dev_id]
            } else {
                // Only search the entries actually filled out by
                // drmGetDevices(); the rest of the array is NULL.
                *drm_devices[..num_drm_devices]
                    .iter()
                    .find(|dev| is_nvidia_device(**dev))
                    .expect("Failed to find an NVIDIA device")
            };

            let dev = nouveau_ws_device_new(drm_dev);
            let dev =
                NonNull::new(dev).expect("Failed to create nouveau device");

            // The device list is no longer needed once the nouveau device
            // has been created.
            drmFreeDevices(
                drm_devices.as_mut_ptr(),
                num_drm_devices.try_into().unwrap(),
            );

            let mut ctx: *mut nouveau_ws_context = std::ptr::null_mut();
            let err = nouveau_ws_context_create(
                dev.as_ptr(),
                NOUVEAU_WS_ENGINE_COMPUTE,
                &mut ctx,
            );
            assert!(err == 0, "Failed to create nouveau context");
            let ctx = NonNull::new(ctx).unwrap();

            let mut syncobj = 0_u32;
            let err = drmSyncobjCreate(dev.as_ref().fd, 0, &mut syncobj);
            assert!(err == 0, "Failed to create syncobj");

            Runner {
                dev,
                ctx,
                syncobj,
                sync_value: Mutex::new(0),
                next_addr: AtomicU64::new(1 << 16),
            }
        }
    }

    pub fn dev_info(&self) -> &nv_device_info {
        unsafe { &self.dev.as_ref().info }
    }

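    /// Submits a single pushbuf at `addr` (`len` bytes), waits for it to
    /// complete on the timeline syncobj, then submits an empty exec so any
    /// error raised by the first submission is reported back to us.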
    fn exec(&self, addr: u64, len: u16) -> io::Result<()> {
        let sync_value = unsafe {
            // Hold the lock across the submit so timeline values are
            // signaled in submission order.
            let mut sync_value = self.sync_value.lock().unwrap();
            *sync_value += 1;

            let push = drm_nouveau_exec_push {
                va: addr,
                va_len: len.into(),
                flags: 0,
            };
            let sig = drm_nouveau_sync {
                flags: DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ,
                handle: self.syncobj,
                timeline_value: *sync_value,
            };
            let exec = drm_nouveau_exec {
                channel: self.ctx.as_ref().channel as u32,
                wait_count: 0,
                wait_ptr: 0,
                push_count: 1,
                push_ptr: &push as *const _ as u64,
                sig_count: 1,
                sig_ptr: &sig as *const _ as u64,
            };
            let err = drmIoctl(
                self.dev.as_ref().fd,
                DRM_RS_IOCTL_NOUVEAU_EXEC,
                &exec as *const _ as *mut std::os::raw::c_void,
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }
            *sync_value
        };
        // The close of this unsafe { } drops the lock

        unsafe {
            let err = drmSyncobjTimelineWait(
                self.dev.as_ref().fd,
                &self.syncobj as *const _ as *mut _,
                &sync_value as *const _ as *mut _,
                1, // num_handles
                i64::MAX, // timeout_nsec
                DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
                std::ptr::null_mut(),
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }

            // Exec again to check for errors
            let mut exec = drm_nouveau_exec {
                channel: self.ctx.as_ref().channel as u32,
                wait_count: 0,
                wait_ptr: 0,
                push_count: 0,
                push_ptr: 0,
                sig_count: 0,
                sig_ptr: 0,
            };
            let err = drmIoctl(
                self.dev.as_ref().fd,
                DRM_RS_IOCTL_NOUVEAU_EXEC,
                ptr::from_mut(&mut exec).cast(),
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }
        }

        Ok(())
    }

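    /// Runs a compute shader straight from a `nak_shader_bin`.
    ///
    /// Everything needed for the dispatch is packed into a single BO:
    /// pushbuf, QMD, shader code, cb0, and the caller's data, in that
    /// order.  The data is copied into the BO before the dispatch and
    /// copied back out afterwards, even if the submission fails.
    ///
    /// # Safety
    ///
    /// `data` must point to at least `data_size` bytes of readable and
    /// writable memory.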
    pub unsafe fn run_raw(
        &self,
        shader: &nak_shader_bin,
        invocations: u32,
        data_stride: u32,
        data: *mut std::os::raw::c_void,
        data_size: usize,
    ) -> io::Result<()> {
        assert!(shader.info.stage == MESA_SHADER_COMPUTE);
        let cs_info = &shader.info.__bindgen_anon_1.cs;
        assert!(cs_info.local_size[1] == 1 && cs_info.local_size[2] == 1);
        let local_size = cs_info.local_size[0];

        // Compute the needed size of the buffer
        let mut size = 0_usize;

        const MAX_PUSH_DW: usize = 256;
        let push_offset = size;
        size = push_offset + 4 * MAX_PUSH_DW;

        const QMD_SIZE: usize = 64 * 4;
        let qmd_offset = size.next_multiple_of(0x100);
        size = qmd_offset + 4 * QMD_SIZE;

        let shader_offset = size.next_multiple_of(0x80);
        size = shader_offset + usize::try_from(shader.code_size).unwrap();

        let cb0_offset = size.next_multiple_of(256);
        size = cb0_offset + std::mem::size_of::<CB0>();

        let data_offset = size.next_multiple_of(256);
        size = data_offset + data_size;

        let bo = BO::new(self, size.try_into().unwrap())?;

        // Copy the data from the caller into our BO
        let data_addr = bo.addr + u64::try_from(data_offset).unwrap();
        let data_map = bo.map.byte_offset(data_offset.try_into().unwrap());
        std::ptr::copy(data, data_map, data_size);

        // Fill out cb0
        let cb0_addr = bo.addr + u64::try_from(cb0_offset).unwrap();
        let cb0_map = bo.map.byte_offset(cb0_offset.try_into().unwrap());
        cb0_map.cast::<CB0>().write(CB0 {
            data_addr_lo: data_addr as u32,
            data_addr_hi: (data_addr >> 32) as u32,
            data_stride,
            invocations,
        });

        // Upload the shader
        let shader_addr = bo.addr + u64::try_from(shader_offset).unwrap();
        let shader_map = bo.map.byte_offset(shader_offset.try_into().unwrap());
        std::ptr::copy(
            shader.code,
            shader_map,
            shader.code_size.try_into().unwrap(),
        );

        // Populate and upload the QMD
        let mut qmd_cbufs: [nak_qmd_cbuf; 8] = unsafe { std::mem::zeroed() };
        qmd_cbufs[0] = nak_qmd_cbuf {
            index: 0,
            size: std::mem::size_of::<CB0>()
                .next_multiple_of(256)
                .try_into()
                .unwrap(),
            addr: cb0_addr,
        };
        let qmd_info = nak_qmd_info {
            // Pre-Volta, we set the program region to the start of the bo
            addr: if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
                shader_offset.try_into().unwrap()
            } else {
                shader_addr
            },
            smem_size: 0,
            smem_max: 48 * 1024,
            global_size: [invocations.div_ceil(local_size.into()), 1, 1],
            num_cbufs: 1,
            cbufs: qmd_cbufs,
        };

        let qmd_addr = bo.addr + u64::try_from(qmd_offset).unwrap();
        let qmd_map = bo.map.byte_offset(qmd_offset.try_into().unwrap());
        nak_fill_qmd(
            self.dev_info(),
            &shader.info,
            &qmd_info,
            qmd_map,
            QMD_SIZE,
        );

        // Fill out the pushbuf
        let mut p = NvPush::new();

        p.push_method(cla0c0::SetObject {
            class_id: self.dev_info().cls_compute.into(),
            engine_id: 0,
        });
        if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
            p.push_method(cla0c0::SetProgramRegionA {
                address_upper: (bo.addr >> 32) as u32,
            });
            p.push_method(cla0c0::SetProgramRegionB {
                address_lower: bo.addr as u32,
            });
        }

        let smem_base_addr = 0xfe000000_u32;
        let lmem_base_addr = 0xff000000_u32;
        if self.dev_info().cls_compute >= VOLTA_COMPUTE_A {
            p.push_method(clc3c0::SetShaderSharedMemoryWindowA {
                base_address_upper: 0,
            });
            p.push_method(clc3c0::SetShaderSharedMemoryWindowB {
                base_address: smem_base_addr,
            });

            p.push_method(clc3c0::SetShaderLocalMemoryWindowA {
                base_address_upper: 0,
            });
            p.push_method(clc3c0::SetShaderLocalMemoryWindowB {
                base_address: lmem_base_addr,
            });
        } else {
            p.push_method(cla0c0::SetShaderSharedMemoryWindow {
                base_address: smem_base_addr,
            });
            p.push_method(cla0c0::SetShaderLocalMemoryWindow {
                base_address: lmem_base_addr,
            });
        }

        if self.dev_info().cls_compute >= MAXWELL_COMPUTE_B {
            p.push_method(clb1c0::InvalidateSkedCaches { v: 0 });
        }

        p.push_method(cla0c0::SendPcasA {
            qmd_address_shifted8: (qmd_addr >> 8) as u32,
        });
        if self.dev_info().cls_compute >= AMPERE_COMPUTE_A {
            p.push_method(clc6c0::SendSignalingPcas2B {
                pcas_action: clc6c0::SendSignalingPcas2BPcasAction::InvalidateCopySchedule,
            });
        } else {
            p.push_method(cla0c0::SendSignalingPcasB {
                invalidate: true,
                schedule: true,
            });
        }

        let push_addr = bo.addr + u64::try_from(push_offset).unwrap();
        let push_map = bo.map.byte_offset(push_offset.try_into().unwrap());
        std::ptr::copy(p.as_ptr(), push_map.cast(), p.len());

        let res = self.exec(push_addr, (p.len() * 4).try_into().unwrap());

        // Always copy the data back to the caller, even if exec fails
        let data_map = bo.map.byte_offset(data_offset.try_into().unwrap());
        std::ptr::copy(data_map, data, data_size);

        res
    }

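    /// Runs `shader` once per element of `data`, passing each invocation the
    /// base address of the data buffer, the element stride, and the
    /// invocation count through cb0 (see [`CB0`]).
    ///
    /// A rough usage sketch (how the shader itself gets compiled is outside
    /// the scope of this runner):
    ///
    /// ```ignore
    /// let runner = Runner::new(None);
    /// let mut data = [0u32; 1024];
    /// runner.run(&shader, &mut data)?;
    /// // data now holds whatever the shader wrote back.
    /// ```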
    pub fn run<T>(
        &self,
        shader: &nak_shader_bin,
        data: &mut [T],
    ) -> io::Result<()> {
        unsafe {
            let stride = std::mem::size_of::<T>();
            self.run_raw(
                shader,
                data.len().try_into().unwrap(),
                stride.try_into().unwrap(),
                data.as_mut_ptr().cast(),
                data.len() * stride,
            )
        }
    }
}

// Runner holds raw pointers, so Send and Sync are not derived automatically;
// submissions are serialized under the sync_value lock in exec().
unsafe impl Sync for Runner {}
unsafe impl Send for Runner {}