/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
 * Header file for the io_uring interface.
 *
 * Copyright (C) 2019 Jens Axboe
 * Copyright (C) 2019 Christoph Hellwig
 */
#ifndef LINUX_IO_URING_H
#define LINUX_IO_URING_H

#include <linux/fs.h>
#include <linux/types.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * IO submission data structure (Submission Queue Entry)
 *
 * Several fields are overlaid in anonymous unions; which union member is
 * meaningful depends on the opcode of the request.
 */
struct io_uring_sqe {
	__u8	opcode;		/* type of operation for this sqe */
	__u8	flags;		/* IOSQE_ flags */
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	union {
		__u64	off;	/* offset into file */
		__u64	addr2;
	};
	union {
		__u64	addr;	/* pointer to buffer or iovecs */
		__u64	splice_off_in;
	};
	__u32	len;		/* buffer size or number of iovecs */
	union {
		__kernel_rwf_t	rw_flags;
		__u32		fsync_flags;
		__u16		poll_events;	/* compatibility */
		__u32		poll32_events;	/* word-reversed for BE */
		__u32		sync_range_flags;
		__u32		msg_flags;
		__u32		timeout_flags;
		__u32		accept_flags;
		__u32		cancel_flags;
		__u32		open_flags;
		__u32		statx_flags;
		__u32		fadvise_advice;
		__u32		splice_flags;
		__u32		rename_flags;
		__u32		unlink_flags;
		__u32		hardlink_flags;
		__u32		xattr_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	union {
		/* index into fixed buffers, if used */
		__u16	buf_index;
		/* for grouped buffer selection */
		__u16	buf_group;
	} __attribute__((packed));
	/* personality to use, if used */
	__u16	personality;
	union {
		__s32	splice_fd_in;
		__u32	file_index;
	};
	__u64	addr3;
	__u64	__pad2[1];
};

/*
 * If sqe->file_index is set to this for opcodes that instantiate a new
 * direct descriptor (like openat/openat2/accept), then io_uring will allocate
 * an available direct descriptor instead of having the application pass one
 * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
 * if the space is full.
 */
#define IORING_FILE_INDEX_ALLOC		(~0U)

/* Bit positions of the sqe->flags values defined below */
enum {
	IOSQE_FIXED_FILE_BIT,
	IOSQE_IO_DRAIN_BIT,
	IOSQE_IO_LINK_BIT,
	IOSQE_IO_HARDLINK_BIT,
	IOSQE_ASYNC_BIT,
	IOSQE_BUFFER_SELECT_BIT,
	IOSQE_CQE_SKIP_SUCCESS_BIT,
};

/*
 * sqe->flags
 */
/* use fixed fileset */
#define IOSQE_FIXED_FILE	(1U << IOSQE_FIXED_FILE_BIT)
/* issue after inflight IO */
#define IOSQE_IO_DRAIN		(1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK		(1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK	(1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
/* don't post CQE if request succeeded */
#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)

/*
 * io_uring_setup() flags
 */
#define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
#define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
#define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
#define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
#define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
/*
 * Cooperative task running. When requests complete, they often require
 * forcing the submitter to transition to the kernel to complete. If this
 * flag is set, work will be done when the task transitions anyway, rather
 * than force an inter-processor interrupt reschedule. This avoids interrupting
 * a task running in userspace, and saves an IPI.
 */
#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
/*
 * If COOP_TASKRUN is set, get notified if task work is available for
 * running and a kernel transition would be needed to run it. This sets
 * IORING_SQ_TASKRUN in the sq ring flags. Only valid together with
 * COOP_TASKRUN.
 */
#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)

#define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */

/* Values for sqe->opcode */
enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
	IORING_OP_WRITEV,
	IORING_OP_FSYNC,
	IORING_OP_READ_FIXED,
	IORING_OP_WRITE_FIXED,
	IORING_OP_POLL_ADD,
	IORING_OP_POLL_REMOVE,
	IORING_OP_SYNC_FILE_RANGE,
	IORING_OP_SENDMSG,
	IORING_OP_RECVMSG,
	IORING_OP_TIMEOUT,
	IORING_OP_TIMEOUT_REMOVE,
	IORING_OP_ACCEPT,
	IORING_OP_ASYNC_CANCEL,
	IORING_OP_LINK_TIMEOUT,
	IORING_OP_CONNECT,
	IORING_OP_FALLOCATE,
	IORING_OP_OPENAT,
	IORING_OP_CLOSE,
	IORING_OP_FILES_UPDATE,
	IORING_OP_STATX,
	IORING_OP_READ,
	IORING_OP_WRITE,
	IORING_OP_FADVISE,
	IORING_OP_MADVISE,
	IORING_OP_SEND,
	IORING_OP_RECV,
	IORING_OP_OPENAT2,
	IORING_OP_EPOLL_CTL,
	IORING_OP_SPLICE,
	IORING_OP_PROVIDE_BUFFERS,
	IORING_OP_REMOVE_BUFFERS,
	IORING_OP_TEE,
	IORING_OP_SHUTDOWN,
	IORING_OP_RENAMEAT,
	IORING_OP_UNLINKAT,
	IORING_OP_MKDIRAT,
	IORING_OP_SYMLINKAT,
	IORING_OP_LINKAT,
	IORING_OP_MSG_RING,
	IORING_OP_FSETXATTR,
	IORING_OP_SETXATTR,
	IORING_OP_FGETXATTR,
	IORING_OP_GETXATTR,
	IORING_OP_SOCKET,
	IORING_OP_URING_CMD,

	/* this goes last, obviously */
	IORING_OP_LAST,
};

/*
 * sqe->fsync_flags
 */
#define IORING_FSYNC_DATASYNC	(1U << 0)

/*
 * sqe->timeout_flags
 */
#define IORING_TIMEOUT_ABS		(1U << 0)
#define IORING_TIMEOUT_UPDATE		(1U << 1)
#define IORING_TIMEOUT_BOOTTIME		(1U << 2)
#define IORING_TIMEOUT_REALTIME		(1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
 * sqe->splice_flags
 * extends splice(2) flags
 */
#define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */

/*
 * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
 * command flags for POLL_ADD are stored in sqe->len.
 *
 * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
 *				the poll handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_POLL_UPDATE_*		Update existing poll request, matching
 *				sqe->addr as the old user_data field.
 */
#define IORING_POLL_ADD_MULTI		(1U << 0)
#define IORING_POLL_UPDATE_EVENTS	(1U << 1)
#define IORING_POLL_UPDATE_USER_DATA	(1U << 2)

/*
 * ASYNC_CANCEL flags.
 *
 * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
 * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
 *				request 'user_data'
 * IORING_ASYNC_CANCEL_ANY	Match any request
 */
#define IORING_ASYNC_CANCEL_ALL	(1U << 0)
#define IORING_ASYNC_CANCEL_FD	(1U << 1)
#define IORING_ASYNC_CANCEL_ANY	(1U << 2)

/*
 * send/sendmsg and recv/recvmsg flags (sqe->addr2)
 *
 * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
 *				or receive and arm poll if that yields an
 *				-EAGAIN result, arm poll upfront and skip
 *				the initial transfer attempt.
 */
#define IORING_RECVSEND_POLL_FIRST	(1U << 0)

/*
 * accept flags stored in sqe->ioprio
 */
#define IORING_ACCEPT_MULTISHOT	(1U << 0)

/*
 * IO completion data structure (Completion Queue Entry)
 */
struct io_uring_cqe {
	__u64	user_data;	/* sqe->user_data value passed back */
	__s32	res;		/* result code for this event */
	__u32	flags;		/* IORING_CQE_F_* flags */

	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 */
	__u64 big_cqe[];
};

/*
 * cqe->flags
 *
 * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 */
#define IORING_CQE_F_BUFFER		(1U << 0)
#define IORING_CQE_F_MORE		(1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)

/* Shift that moves the buffer ID out of the upper bits of cqe->flags */
enum {
	IORING_CQE_BUFFER_SHIFT		= 16,
};

/*
 * Magic offsets for the application to mmap the data it needs
 */
#define IORING_OFF_SQ_RING		0ULL
#define IORING_OFF_CQ_RING		0x8000000ULL
#define IORING_OFF_SQES			0x10000000ULL

/*
 * Filled with the offset for mmap(2)
 */
struct io_sqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 flags;
	__u32 dropped;
	__u32 array;
	__u32 resv1;
	__u64 resv2;
};

/*
 * sq_ring->flags
 */
#define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring is overflown */
#define IORING_SQ_TASKRUN	(1U << 2) /* task should enter the kernel */

/*
 * Completion-ring counterpart of struct io_sqring_offsets
 */
struct io_cqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 overflow;
	__u32 cqes;
	__u32 flags;
	__u32 resv1;
	__u64 resv2;
};

/*
 * cq_ring->flags
 */

/* disable eventfd notifications */
#define IORING_CQ_EVENTFD_DISABLED	(1U << 0)

/*
 * io_uring_enter(2) flags
 */
#define IORING_ENTER_GETEVENTS		(1U << 0)
#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
#define IORING_ENTER_SQ_WAIT		(1U << 2)
#define IORING_ENTER_EXT_ARG		(1U << 3)
#define IORING_ENTER_REGISTERED_RING	(1U << 4)

/*
 * Passed in for io_uring_setup(2). Copied back with updated info on success
 */
struct io_uring_params {
	__u32 sq_entries;
	__u32 cq_entries;
	__u32 flags;		/* IORING_SETUP_* flags */
	__u32 sq_thread_cpu;
	__u32 sq_thread_idle;
	__u32 features;		/* IORING_FEAT_* flags */
	__u32 wq_fd;
	__u32 resv[3];
	struct io_sqring_offsets sq_off;
	struct io_cqring_offsets cq_off;
};

/*
 * io_uring_params->features flags
 */
#define IORING_FEAT_SINGLE_MMAP		(1U << 0)
#define IORING_FEAT_NODROP		(1U << 1)
#define IORING_FEAT_SUBMIT_STABLE	(1U << 2)
#define IORING_FEAT_RW_CUR_POS		(1U << 3)
#define IORING_FEAT_CUR_PERSONALITY	(1U << 4)
#define IORING_FEAT_FAST_POLL		(1U << 5)
#define IORING_FEAT_POLL_32BITS		(1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
#define IORING_FEAT_EXT_ARG		(1U << 8)
#define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
#define IORING_FEAT_RSRC_TAGS		(1U << 10)
#define IORING_FEAT_CQE_SKIP		(1U << 11)
#define IORING_FEAT_LINKED_FILE		(1U << 12)

/*
 * io_uring_register(2) opcodes and arguments
 */
enum {
	IORING_REGISTER_BUFFERS			= 0,
	IORING_UNREGISTER_BUFFERS		= 1,
	IORING_REGISTER_FILES			= 2,
	IORING_UNREGISTER_FILES			= 3,
	IORING_REGISTER_EVENTFD			= 4,
	IORING_UNREGISTER_EVENTFD		= 5,
	IORING_REGISTER_FILES_UPDATE		= 6,
	IORING_REGISTER_EVENTFD_ASYNC		= 7,
	IORING_REGISTER_PROBE			= 8,
	IORING_REGISTER_PERSONALITY		= 9,
	IORING_UNREGISTER_PERSONALITY		= 10,
	IORING_REGISTER_RESTRICTIONS		= 11,
	IORING_REGISTER_ENABLE_RINGS		= 12,

	/* extended with tagging */
	IORING_REGISTER_FILES2			= 13,
	IORING_REGISTER_FILES_UPDATE2		= 14,
	IORING_REGISTER_BUFFERS2		= 15,
	IORING_REGISTER_BUFFERS_UPDATE		= 16,

	/* set/clear io-wq thread affinities */
	IORING_REGISTER_IOWQ_AFF		= 17,
	IORING_UNREGISTER_IOWQ_AFF		= 18,

	/* set/get max number of io-wq workers */
	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,

	/* register/unregister io_uring fd with the ring */
	IORING_REGISTER_RING_FDS		= 20,
	IORING_UNREGISTER_RING_FDS		= 21,

	/* register ring based provide buffer group */
	IORING_REGISTER_PBUF_RING		= 22,
	IORING_UNREGISTER_PBUF_RING		= 23,

	/* this goes last */
	IORING_REGISTER_LAST
};

/* io-wq worker categories */
enum {
	IO_WQ_BOUND,
	IO_WQ_UNBOUND,
};

/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
	__u32 offset;
	__u32 resv;
	__aligned_u64 /* __s32 * */ fds;
};

/*
 * Register a fully sparse file space, rather than pass in an array of all
 * -1 file descriptors.
 */
#define IORING_RSRC_REGISTER_SPARSE	(1U << 0)

struct io_uring_rsrc_register {
	__u32 nr;
	__u32 flags;
	__u64 resv2;
	__aligned_u64 data;
	__aligned_u64 tags;
};

struct io_uring_rsrc_update {
	__u32 offset;
	__u32 resv;
	__aligned_u64 data;
};

struct io_uring_rsrc_update2 {
	__u32 offset;
	__u32 resv;
	__aligned_u64 data;
	__aligned_u64 tags;
	__u32 nr;
	__u32 resv2;
};

/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP	(-2)

#define IO_URING_OP_SUPPORTED	(1U << 0)

struct io_uring_probe_op {
	__u8 op;
	__u8 resv;
	__u16 flags;	/* IO_URING_OP_* flags */
	__u32 resv2;
};

struct io_uring_probe {
	__u8 last_op;	/* last opcode supported */
	__u8 ops_len;	/* length of ops[] array below */
	__u16 resv;
	__u32 resv2[3];
	/*
	 * C99 flexible array member (same form as io_uring_cqe::big_cqe);
	 * it contributes nothing to sizeof(struct io_uring_probe).
	 */
	struct io_uring_probe_op ops[];
};

struct io_uring_restriction {
	__u16 opcode;	/* IORING_RESTRICTION_* value */
	union {
		__u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
		__u8 sqe_op;      /* IORING_RESTRICTION_SQE_OP */
		__u8 sqe_flags;   /* IORING_RESTRICTION_SQE_FLAGS_* */
	};
	__u8 resv;
	__u32 resv2[3];
};

struct io_uring_buf {
	__u64	addr;
	__u32	len;
	__u16	bid;
	__u16	resv;
};

struct io_uring_buf_ring {
	union {
		/*
		 * To avoid spilling into more pages than we need to, the
		 * ring tail is overlaid with the io_uring_buf->resv field.
		 */
		struct {
			__u64	resv1;
			__u32	resv2;
			__u16	resv3;
			__u16	tail;
		};
		/*
		 * Kept as a zero-length array: standard C does not allow a
		 * flexible array member directly inside a union.
		 */
		struct io_uring_buf	bufs[0];
	};
};

/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
	__u64	ring_addr;
	__u32	ring_entries;
	__u16	bgid;
	__u16	pad;
	__u64	resv[3];
};

/*
 * io_uring_restriction->opcode values
 */
enum {
	/* Allow an io_uring_register(2) opcode */
	IORING_RESTRICTION_REGISTER_OP		= 0,

	/* Allow an sqe opcode */
	IORING_RESTRICTION_SQE_OP		= 1,

	/* Allow sqe flags */
	IORING_RESTRICTION_SQE_FLAGS_ALLOWED	= 2,

	/* Require sqe flags (these flags must be set on each submission) */
	IORING_RESTRICTION_SQE_FLAGS_REQUIRED	= 3,

	IORING_RESTRICTION_LAST
};

struct io_uring_getevents_arg {
	__u64	sigmask;
	__u32	sigmask_sz;
	__u32	pad;
	__u64	ts;
};

#ifdef __cplusplus
}
#endif

#endif