1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::collections::BTreeMap;
6 use std::fs::File;
7 use std::io;
8 use std::mem::size_of;
9 use std::time::Duration;
10
11 use anyhow::anyhow;
12 use anyhow::Context;
13 use base::error;
14 use base::AsRawDescriptor;
15 use base::Error as SysError;
16 use base::Event;
17 use base::RawDescriptor;
18 use base::Result as SysResult;
19 use base::Timer;
20 use base::Tube;
21 use base::TubeError;
22 use base::WorkerThread;
23 use cros_async::select3;
24 use cros_async::select4;
25 use cros_async::AsyncError;
26 use cros_async::EventAsync;
27 use cros_async::Executor;
28 use cros_async::TimerAsync;
29 use data_model::Le32;
30 use data_model::Le64;
31 use futures::pin_mut;
32 use remain::sorted;
33 use thiserror::Error;
34 use vm_control::MemSlot;
35 use vm_control::VmMemoryMappingRequest;
36 use vm_control::VmMemoryMappingResponse;
37 use vm_memory::GuestAddress;
38 use vm_memory::GuestMemory;
39 use zerocopy::AsBytes;
40 use zerocopy::FromBytes;
41 use zerocopy::FromZeroes;
42
43 use super::async_utils;
44 use super::copy_config;
45 use super::DescriptorChain;
46 use super::DeviceType;
47 use super::Interrupt;
48 use super::Queue;
49 use super::VirtioDevice;
50
// Each virtio-pmem device exposes a single request virtqueue of 256 entries.
const QUEUE_SIZE: u16 = 256;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

/* Feature bits */
// When negotiated, the guest may issue discard (range) requests.
const VIRTIO_PMEM_F_DISCARD: u32 = 63;

// Request type codes read from the virtqueue.
const VIRTIO_PMEM_REQ_TYPE_FLUSH: u32 = 0;
// NOTE(review): u32::MAX appears to be a crosvm-specific discard request code,
// not a value from the virtio specification — confirm against the guest driver.
const VIRTIO_PMEM_REQ_TYPE_DISCARD: u32 = u32::MAX;
// Response status codes written back to the guest.
const VIRTIO_PMEM_RESP_TYPE_OK: u32 = 0;
const VIRTIO_PMEM_RESP_TYPE_EIO: u32 = 1;
61
/// Device configuration space layout: the guest-physical start address and
/// size of the mapped region, little-endian as seen by the guest.
#[derive(Copy, Clone, Debug, Default, AsBytes, FromZeroes, FromBytes)]
#[repr(C)]
struct virtio_pmem_config {
    start_address: Le64,
    size: Le64,
}
68
/// Response written back to the guest; carries one of the
/// `VIRTIO_PMEM_RESP_TYPE_*` status codes.
#[derive(Copy, Clone, Debug, Default, AsBytes, FromZeroes, FromBytes)]
#[repr(C)]
struct virtio_pmem_resp {
    status_code: Le32,
}
74
/// Minimal request header containing only the request type
/// (`VIRTIO_PMEM_REQ_TYPE_*`); used for flush requests.
#[derive(Copy, Clone, Debug, Default, AsBytes, FromZeroes, FromBytes)]
#[repr(C)]
struct virtio_pmem_req {
    type_: Le32,
}
80
/// Extended request used for range operations (discard): the request type
/// plus a target range, relative to the start of the mapping.
#[derive(Copy, Clone, Debug, Default, AsBytes, FromZeroes, FromBytes)]
#[repr(C)]
struct virtio_pmem_range_req {
    type_: Le32,
    padding_: Le32,
    start_address: Le64,
    size: Le64,
}
89
/// Errors produced while servicing pmem virtqueue requests or the periodic
/// pageout timer.
#[sorted]
#[derive(Error, Debug)]
enum Error {
    /// Failed to get value from pageout timer.
    #[error("failed to get value from pageout timer: {0}")]
    PageoutTimer(AsyncError),
    /// Failed to read from virtqueue.
    #[error("failed to read from virtqueue: {0}")]
    ReadQueue(io::Error),
    /// Failed to receive tube response.
    #[error("failed to receive tube response: {0}")]
    ReceiveResponse(TubeError),
    /// Failed to send tube request.
    #[error("failed to send tube request: {0}")]
    SendingRequest(TubeError),
    /// Failed to write to virtqueue.
    #[error("failed to write to virtqueue: {0}")]
    WriteQueue(io::Error),
}

/// Local result alias for this device's worker code.
type Result<T> = ::std::result::Result<T, Error>;
111
pageout( ex: &Executor, swap_interval: Duration, pmem_device_tube: &Tube, mapping_arena_slot: u32, mapping_size: usize, ) -> Result<()>112 async fn pageout(
113 ex: &Executor,
114 swap_interval: Duration,
115 pmem_device_tube: &Tube,
116 mapping_arena_slot: u32,
117 mapping_size: usize,
118 ) -> Result<()> {
119 let timer = Timer::new().expect("Failed to create a timer");
120 let mut pageout_timer =
121 TimerAsync::new(timer, ex).expect("Failed to create an async pageout timer");
122 pageout_timer
123 .reset_repeating(swap_interval)
124 .expect("Failed to reset pageout timer");
125
126 loop {
127 pageout_timer.wait().await.map_err(Error::PageoutTimer)?;
128 let request = VmMemoryMappingRequest::MadvisePageout {
129 slot: mapping_arena_slot,
130 offset: 0,
131 size: mapping_size,
132 };
133
134 pmem_device_tube
135 .send(&request)
136 .map_err(Error::SendingRequest)?;
137 match pmem_device_tube
138 .recv::<VmMemoryMappingResponse>()
139 .map_err(Error::ReceiveResponse)?
140 {
141 VmMemoryMappingResponse::Ok => {}
142 VmMemoryMappingResponse::Err(e) => {
143 error!("failed to page out the memory mapping: {}", e);
144 }
145 };
146 }
147 }
148
execute_request( request_type: u32, start_address: u64, size: u64, pmem_device_tube: &Tube, mapping_arena_slot: u32, mapping_size: usize, ) -> u32149 fn execute_request(
150 request_type: u32,
151 start_address: u64,
152 size: u64,
153 pmem_device_tube: &Tube,
154 mapping_arena_slot: u32,
155 mapping_size: usize,
156 ) -> u32 {
157 match request_type {
158 VIRTIO_PMEM_REQ_TYPE_FLUSH => {
159 let request = VmMemoryMappingRequest::MsyncArena {
160 slot: mapping_arena_slot,
161 offset: 0, // The pmem backing file is always at offset 0 in the arena.
162 size: mapping_size,
163 };
164
165 if let Err(e) = pmem_device_tube.send(&request) {
166 error!("failed to send request: {}", e);
167 return VIRTIO_PMEM_RESP_TYPE_EIO;
168 }
169
170 match pmem_device_tube.recv() {
171 Ok(response) => match response {
172 VmMemoryMappingResponse::Ok => VIRTIO_PMEM_RESP_TYPE_OK,
173 VmMemoryMappingResponse::Err(e) => {
174 error!("failed flushing disk image: {}", e);
175 VIRTIO_PMEM_RESP_TYPE_EIO
176 }
177 },
178 Err(e) => {
179 error!("failed to receive data: {}", e);
180 VIRTIO_PMEM_RESP_TYPE_EIO
181 }
182 }
183 }
184
185 VIRTIO_PMEM_REQ_TYPE_DISCARD => {
186 let request = VmMemoryMappingRequest::MadviseRemove {
187 slot: mapping_arena_slot,
188 offset: usize::try_from(start_address).unwrap(),
189 size: usize::try_from(size).unwrap(),
190 };
191
192 if let Err(e) = pmem_device_tube.send(&request) {
193 error!("failed to send request: {}", e);
194 return VIRTIO_PMEM_RESP_TYPE_EIO;
195 }
196
197 match pmem_device_tube.recv() {
198 Ok(response) => match response {
199 VmMemoryMappingResponse::Ok => VIRTIO_PMEM_RESP_TYPE_OK,
200 VmMemoryMappingResponse::Err(e) => {
201 error!("failed to discard memory range: {}", e);
202 VIRTIO_PMEM_RESP_TYPE_EIO
203 }
204 },
205 Err(e) => {
206 error!("failed to receive data: {}", e);
207 VIRTIO_PMEM_RESP_TYPE_EIO
208 }
209 }
210 }
211
212 _ => {
213 error!("unknown request type: {}", request_type);
214 VIRTIO_PMEM_RESP_TYPE_EIO
215 }
216 }
217 }
218
handle_request( avail_desc: &mut DescriptorChain, pmem_device_tube: &Tube, mapping_arena_slot: u32, mapping_size: usize, ) -> Result<usize>219 fn handle_request(
220 avail_desc: &mut DescriptorChain,
221 pmem_device_tube: &Tube,
222 mapping_arena_slot: u32,
223 mapping_size: usize,
224 ) -> Result<usize> {
225 let (request_type, start_address, size) =
226 if avail_desc.reader.available_bytes() == size_of::<virtio_pmem_req>() {
227 let request = avail_desc
228 .reader
229 .read_obj::<virtio_pmem_req>()
230 .map_err(Error::ReadQueue)?;
231 (request.type_.to_native(), 0, 0)
232 } else {
233 let request = avail_desc
234 .reader
235 .read_obj::<virtio_pmem_range_req>()
236 .map_err(Error::ReadQueue)?;
237 (
238 request.type_.to_native(),
239 request.start_address.to_native(),
240 request.size.to_native(),
241 )
242 };
243 let status_code = execute_request(
244 request_type,
245 start_address,
246 size,
247 pmem_device_tube,
248 mapping_arena_slot,
249 mapping_size,
250 );
251
252 let response = virtio_pmem_resp {
253 status_code: status_code.into(),
254 };
255
256 avail_desc
257 .writer
258 .write_obj(response)
259 .map_err(Error::WriteQueue)?;
260
261 Ok(avail_desc.writer.bytes_written())
262 }
263
handle_queue( queue: &mut Queue, mut queue_event: EventAsync, pmem_device_tube: &Tube, mapping_arena_slot: u32, mapping_size: usize, )264 async fn handle_queue(
265 queue: &mut Queue,
266 mut queue_event: EventAsync,
267 pmem_device_tube: &Tube,
268 mapping_arena_slot: u32,
269 mapping_size: usize,
270 ) {
271 loop {
272 let mut avail_desc = match queue.next_async(&mut queue_event).await {
273 Err(e) => {
274 error!("Failed to read descriptor {}", e);
275 return;
276 }
277 Ok(d) => d,
278 };
279
280 let written = match handle_request(
281 &mut avail_desc,
282 pmem_device_tube,
283 mapping_arena_slot,
284 mapping_size,
285 ) {
286 Ok(n) => n,
287 Err(e) => {
288 error!("pmem: failed to handle request: {}", e);
289 0
290 }
291 };
292 queue.add_used(avail_desc, written as u32);
293 queue.trigger_interrupt();
294 }
295 }
296
/// Worker-thread entry point: drives the request queue, IRQ resample events,
/// the kill event, and (when `swap_interval` is set and non-zero) the
/// periodic pageout task, all on a thread-local async executor.
fn run_worker(
    queue: &mut Queue,
    pmem_device_tube: &Tube,
    interrupt: Interrupt,
    kill_evt: Event,
    mapping_arena_slot: u32,
    mapping_size: usize,
    swap_interval: Option<Duration>,
) {
    let ex = Executor::new().unwrap();

    let queue_evt = queue
        .event()
        .try_clone()
        .expect("failed to clone queue event");
    let queue_evt = EventAsync::new(queue_evt, &ex).expect("failed to set up the queue event");

    // Process requests from the virtio queue.
    let queue_fut = handle_queue(
        queue,
        queue_evt,
        pmem_device_tube,
        mapping_arena_slot,
        mapping_size,
    );
    pin_mut!(queue_fut);

    // Process any requests to resample the irq value.
    let resample = async_utils::handle_irq_resample(&ex, interrupt);
    pin_mut!(resample);

    // Exit if the kill event is triggered.
    let kill = async_utils::await_and_exit(&ex, kill_evt);
    pin_mut!(kill);

    // A missing or zero interval disables periodic pageout entirely; the
    // pageout future is only created (and selected on) when it is enabled.
    let interval = swap_interval.unwrap_or(Duration::ZERO);
    if interval.is_zero() {
        if let Err(e) = ex.run_until(select3(queue_fut, resample, kill)) {
            error!("error happened in executor: {}", e);
        }
    } else {
        let pageout_fut = pageout(
            &ex,
            interval,
            pmem_device_tube,
            mapping_arena_slot,
            mapping_size,
        );
        pin_mut!(pageout_fut);
        if let Err(e) = ex.run_until(select4(queue_fut, resample, kill, pageout_fut)) {
            error!("error happened in executor: {}", e);
        }
    }
}
351
/// Specifies how memory slot is initialized.
pub enum MemSlotConfig {
    /// The memory region has already been mapped to the guest.
    MemSlot {
        /// index of the guest-mapped memory regions.
        idx: MemSlot,
    },
    /// The memory region that is not initialized yet and whose slot index will be provided via
    /// `Tube` later. e.g. pmem-ext2 device, where fs construction will be done in the main
    /// process.
    LazyInit {
        /// Tube on which the main process sends the slot index once the
        /// mapping exists; received during `activate`.
        tube: Tube,
    },
}
364
/// A virtio-pmem device exposing a fixed guest-mapped memory region.
pub struct Pmem {
    // Running worker; yields the queue and device tube back when stopped.
    worker_thread: Option<WorkerThread<(Queue, Tube)>>,
    // Advertised virtio feature bits (may include VIRTIO_PMEM_F_DISCARD).
    features: u64,
    // Backing disk image, if the mapping is file-backed.
    disk_image: Option<File>,
    // Guest physical address where the region is mapped.
    mapping_address: GuestAddress,
    // How the backing memory slot index is obtained (see `MemSlotConfig`).
    mem_slot: MemSlotConfig,
    // Size of the mapped region in bytes (validated to fit a usize in `new`).
    mapping_size: u64,
    // Tube to the main process; moved into the worker while it is running.
    pmem_device_tube: Option<Tube>,
    // Interval for periodic pageout; `None` (or zero) disables it.
    swap_interval: Option<Duration>,
}
375
/// Snapshot payload: records only the mapping geometry, which restore uses as
/// a consistency check against the current device configuration.
#[derive(serde::Serialize, serde::Deserialize)]
struct PmemSnapshot {
    mapping_address: GuestAddress,
    mapping_size: u64,
}
381
/// Configuration of a virtio-pmem device.
pub struct PmemConfig {
    /// Disk image exposed to the guest.
    /// If the memory region is not backed by a file, this should be `None`.
    pub disk_image: Option<File>,
    /// Guest physical address where the memory will be mapped.
    pub mapping_address: GuestAddress,
    /// How the backing memory slot is provided (pre-mapped or lazily via tube).
    pub mem_slot: MemSlotConfig,
    /// The size of the mapped region.
    pub mapping_size: u64,
    /// A communication channel to the main process to send memory requests.
    pub pmem_device_tube: Tube,
    /// Interval for periodic swap out of memory mapping
    pub swap_interval: Option<Duration>,
    /// Whether the region is writable or not.
    pub mapping_writable: bool,
}
399
400 impl Pmem {
new(base_features: u64, cfg: PmemConfig) -> SysResult<Pmem>401 pub fn new(base_features: u64, cfg: PmemConfig) -> SysResult<Pmem> {
402 if cfg.mapping_size > usize::MAX as u64 {
403 return Err(SysError::new(libc::EOVERFLOW));
404 }
405
406 let mut avail_features = base_features;
407 if cfg.mapping_writable {
408 if let MemSlotConfig::LazyInit { .. } = cfg.mem_slot {
409 error!("pmem-ext2 must be a read-only device");
410 return Err(SysError::new(libc::EINVAL));
411 }
412
413 avail_features |= 1 << VIRTIO_PMEM_F_DISCARD;
414 }
415
416 Ok(Pmem {
417 worker_thread: None,
418 features: avail_features,
419 disk_image: cfg.disk_image,
420 mapping_address: cfg.mapping_address,
421 mem_slot: cfg.mem_slot,
422 mapping_size: cfg.mapping_size,
423 pmem_device_tube: Some(cfg.pmem_device_tube),
424 swap_interval: cfg.swap_interval,
425 })
426 }
427 }
428
impl VirtioDevice for Pmem {
    // Descriptors that must remain open in the (possibly sandboxed) device
    // process: the disk image and any tubes to the main process.
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut keep_rds = Vec::new();
        if let Some(disk_image) = &self.disk_image {
            keep_rds.push(disk_image.as_raw_descriptor());
        }

        if let Some(ref pmem_device_tube) = self.pmem_device_tube {
            keep_rds.push(pmem_device_tube.as_raw_descriptor());
        }

        if let MemSlotConfig::LazyInit { tube } = &self.mem_slot {
            keep_rds.push(tube.as_raw_descriptor());
        }

        keep_rds
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Pmem
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.features
    }

    // Exposes the mapping's guest address and size via the virtio config space.
    fn read_config(&self, offset: u64, data: &mut [u8]) {
        let config = virtio_pmem_config {
            start_address: Le64::from(self.mapping_address.offset()),
            size: Le64::from(self.mapping_size),
        };
        copy_config(data, 0, config.as_bytes(), offset);
    }

    // Takes ownership of the single request queue and the device tube, then
    // spawns the worker thread that services the device.
    fn activate(
        &mut self,
        _memory: GuestMemory,
        interrupt: Interrupt,
        mut queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        if queues.len() != 1 {
            return Err(anyhow!("expected 1 queue, got {}", queues.len()));
        }

        let mut queue = queues.remove(&0).unwrap();

        // We checked that this fits in a usize in `Pmem::new`.
        let mapping_size = self.mapping_size as usize;

        let pmem_device_tube = self
            .pmem_device_tube
            .take()
            .context("missing pmem device tube")?;

        let swap_interval = self.swap_interval;

        // For lazy-init (pmem-ext2) devices, block until the main process
        // sends the slot index of the freshly created mapping.
        let mapping_arena_slot = match &self.mem_slot {
            MemSlotConfig::MemSlot { idx } => *idx,
            MemSlotConfig::LazyInit { tube } => tube
                .recv::<u32>()
                .context("failed to receive memory slot for ext2 pmem device")?,
        };

        self.worker_thread = Some(WorkerThread::start("v_pmem", move |kill_event| {
            run_worker(
                &mut queue,
                &pmem_device_tube,
                interrupt,
                kill_event,
                mapping_arena_slot,
                mapping_size,
                swap_interval,
            );
            // Hand the queue and tube back so reset/sleep can reclaim them.
            (queue, pmem_device_tube)
        }));

        Ok(())
    }

    // Stops the worker (if running) and reclaims the device tube.
    fn reset(&mut self) -> anyhow::Result<()> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let (_queue, pmem_device_tube) = worker_thread.stop();
            self.pmem_device_tube = Some(pmem_device_tube);
        }
        Ok(())
    }

    // Stops the worker and returns the queue so it can be restored on wake.
    fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let (queue, pmem_device_tube) = worker_thread.stop();
            self.pmem_device_tube = Some(pmem_device_tube);
            return Ok(Some(BTreeMap::from([(0, queue)])));
        }
        Ok(None)
    }

    // Waking is a plain re-activation with the saved queue state.
    fn virtio_wake(
        &mut self,
        queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
    ) -> anyhow::Result<()> {
        if let Some((mem, interrupt, queues)) = queues_state {
            self.activate(mem, interrupt, queues)?;
        }
        Ok(())
    }

    // The snapshot records only the mapping geometry; restore verifies it
    // matches the current configuration and otherwise carries no state.
    fn virtio_snapshot(&mut self) -> anyhow::Result<serde_json::Value> {
        serde_json::to_value(PmemSnapshot {
            mapping_address: self.mapping_address,
            mapping_size: self.mapping_size,
        })
        .context("failed to serialize pmem snapshot")
    }

    fn virtio_restore(&mut self, data: serde_json::Value) -> anyhow::Result<()> {
        let snapshot: PmemSnapshot =
            serde_json::from_value(data).context("failed to deserialize pmem snapshot")?;
        anyhow::ensure!(
            snapshot.mapping_address == self.mapping_address
                && snapshot.mapping_size == self.mapping_size,
            "pmem snapshot doesn't match config: expected {:?}, got {:?}",
            (self.mapping_address, self.mapping_size),
            (snapshot.mapping_address, snapshot.mapping_size),
        );
        Ok(())
    }
}
560