1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * NVMe PCI Endpoint Function target driver.
4 *
5 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
6 * Copyright (c) 2024, Rick Wertenbroek <[email protected]>
7 * REDS Institute, HEIG-VD, HES-SO, Switzerland
8 */
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11 #include <linux/delay.h>
12 #include <linux/dmaengine.h>
13 #include <linux/io.h>
14 #include <linux/mempool.h>
15 #include <linux/module.h>
16 #include <linux/mutex.h>
17 #include <linux/nvme.h>
18 #include <linux/pci_ids.h>
19 #include <linux/pci-epc.h>
20 #include <linux/pci-epf.h>
21 #include <linux/pci_regs.h>
22 #include <linux/slab.h>
23
24 #include "nvmet.h"
25
26 static LIST_HEAD(nvmet_pci_epf_ports);
27 static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex);
28
29 /*
30 * Default and maximum allowed data transfer size. For the default,
31 * allow up to 128 page-sized segments. For the maximum allowed,
32 * use 4 times the default (which is completely arbitrary).
33 */
34 #define NVMET_PCI_EPF_MAX_SEGS 128
35 #define NVMET_PCI_EPF_MDTS_KB \
36 (NVMET_PCI_EPF_MAX_SEGS << (PAGE_SHIFT - 10))
37 #define NVMET_PCI_EPF_MAX_MDTS_KB (NVMET_PCI_EPF_MDTS_KB * 4)
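/*
 * Example, assuming 4 KiB pages (PAGE_SHIFT == 12): the default MDTS is
 * 128 << (12 - 10) = 512 KB and the maximum allowed value is 2048 KB.
 */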
38
39 /*
40 * IRQ vector coalescing threshold: by default, post 8 CQEs before raising an
41 * interrupt vector to the host. This default 8 is completely arbitrary and can
42 be changed by the host with an nvme_set_features command.
43 */
44 #define NVMET_PCI_EPF_IV_THRESHOLD 8
45
46 /*
47 * BAR CC register and SQ polling intervals.
48 */
49 #define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(10)
50 #define NVMET_PCI_EPF_SQ_POLL_INTERVAL msecs_to_jiffies(5)
51 #define NVMET_PCI_EPF_SQ_POLL_IDLE msecs_to_jiffies(5000)
52
53 /*
54 * SQ arbitration burst default: fetch at most 8 commands at a time from an SQ.
55 */
56 #define NVMET_PCI_EPF_SQ_AB 8
57
58 /*
59 * Handling of CQs is normally immediate, unless we fail to map a CQ or the CQ
60 * is full, in which case we retry the CQ processing after this interval.
61 */
62 #define NVMET_PCI_EPF_CQ_RETRY_INTERVAL msecs_to_jiffies(1)
63
64 enum nvmet_pci_epf_queue_flags {
65 NVMET_PCI_EPF_Q_IS_SQ = 0, /* The queue is a submission queue */
66 NVMET_PCI_EPF_Q_LIVE, /* The queue is live */
67 NVMET_PCI_EPF_Q_IRQ_ENABLED, /* IRQ is enabled for this queue */
68 };
69
70 /*
71 * IRQ vector descriptor.
72 */
73 struct nvmet_pci_epf_irq_vector {
74 unsigned int vector;
75 unsigned int ref;
76 bool cd;
77 int nr_irqs;
78 };
79
80 struct nvmet_pci_epf_queue {
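/* A queue instance is used either as an SQ or as a CQ, hence the union. */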
81 union {
82 struct nvmet_sq nvme_sq;
83 struct nvmet_cq nvme_cq;
84 };
85 struct nvmet_pci_epf_ctrl *ctrl;
86 unsigned long flags;
87
88 u64 pci_addr;
89 size_t pci_size;
90 struct pci_epc_map pci_map;
91
92 u16 qid;
93 u16 depth;
94 u16 vector;
95 u16 head;
96 u16 tail;
97 u16 phase;
98 u32 db;
99
100 size_t qes;
101
102 struct nvmet_pci_epf_irq_vector *iv;
103 struct workqueue_struct *iod_wq;
104 struct delayed_work work;
105 spinlock_t lock;
106 struct list_head list;
107 };
108
109 /*
110 * PCI Root Complex (RC) address data segment for mapping an admin or
111 * I/O command buffer @buf of @length bytes to the PCI address @pci_addr.
112 */
113 struct nvmet_pci_epf_segment {
114 void *buf;
115 u64 pci_addr;
116 u32 length;
117 };
118
119 /*
120 * Command descriptors.
121 */
122 struct nvmet_pci_epf_iod {
123 struct list_head link;
124
125 struct nvmet_req req;
126 struct nvme_command cmd;
127 struct nvme_completion cqe;
128 unsigned int status;
129
130 struct nvmet_pci_epf_ctrl *ctrl;
131
132 struct nvmet_pci_epf_queue *sq;
133 struct nvmet_pci_epf_queue *cq;
134
135 /* Data transfer size and direction for the command. */
136 size_t data_len;
137 enum dma_data_direction dma_dir;
138
139 /*
140 * PCI Root Complex (RC) address data segments: if nr_data_segs is 1, we
141 * use only @data_seg. Otherwise, the array of segments @data_segs is
142 * allocated to manage multiple PCI address data segments. @data_sgl and
143 * @data_sgt are used to setup the command request for execution by the
144 * target core.
145 */
146 unsigned int nr_data_segs;
147 struct nvmet_pci_epf_segment data_seg;
148 struct nvmet_pci_epf_segment *data_segs;
149 struct scatterlist data_sgl;
150 struct sg_table data_sgt;
151
152 struct work_struct work;
153 struct completion done;
154 };
155
156 /*
157 * PCI target controller private data.
158 */
159 struct nvmet_pci_epf_ctrl {
160 struct nvmet_pci_epf *nvme_epf;
161 struct nvmet_port *port;
162 struct nvmet_ctrl *tctrl;
163 struct device *dev;
164
165 unsigned int nr_queues;
166 struct nvmet_pci_epf_queue *sq;
167 struct nvmet_pci_epf_queue *cq;
168 unsigned int sq_ab;
169
170 mempool_t iod_pool;
171 void *bar;
172 u64 cap;
173 u32 cc;
174 u32 csts;
175
176 size_t io_sqes;
177 size_t io_cqes;
178
179 size_t mps_shift;
180 size_t mps;
181 size_t mps_mask;
182
183 unsigned int mdts;
184
185 struct delayed_work poll_cc;
186 struct delayed_work poll_sqs;
187
188 struct mutex irq_lock;
189 struct nvmet_pci_epf_irq_vector *irq_vectors;
190 unsigned int irq_vector_threshold;
191
192 bool link_up;
193 bool enabled;
194 };
195
196 /*
197 * PCI EPF driver private data.
198 */
199 struct nvmet_pci_epf {
200 struct pci_epf *epf;
201
202 const struct pci_epc_features *epc_features;
203
204 void *reg_bar;
205 size_t msix_table_offset;
206
207 unsigned int irq_type;
208 unsigned int nr_vectors;
209
210 struct nvmet_pci_epf_ctrl ctrl;
211
212 bool dma_enabled;
213 struct dma_chan *dma_tx_chan;
214 struct mutex dma_tx_lock;
215 struct dma_chan *dma_rx_chan;
216 struct mutex dma_rx_lock;
217
218 struct mutex mmio_lock;
219
220 /* PCI endpoint function configfs attributes. */
221 struct config_group group;
222 __le16 portid;
223 char subsysnqn[NVMF_NQN_SIZE];
224 unsigned int mdts_kb;
225 };
226
227 static inline u32 nvmet_pci_epf_bar_read32(struct nvmet_pci_epf_ctrl *ctrl,
228 u32 off)
229 {
230 __le32 *bar_reg = ctrl->bar + off;
231
232 return le32_to_cpu(READ_ONCE(*bar_reg));
233 }
234
235 static inline void nvmet_pci_epf_bar_write32(struct nvmet_pci_epf_ctrl *ctrl,
236 u32 off, u32 val)
237 {
238 __le32 *bar_reg = ctrl->bar + off;
239
240 WRITE_ONCE(*bar_reg, cpu_to_le32(val));
241 }
242
243 static inline u64 nvmet_pci_epf_bar_read64(struct nvmet_pci_epf_ctrl *ctrl,
244 u32 off)
245 {
246 return (u64)nvmet_pci_epf_bar_read32(ctrl, off) |
247 ((u64)nvmet_pci_epf_bar_read32(ctrl, off + 4) << 32);
248 }
249
250 static inline void nvmet_pci_epf_bar_write64(struct nvmet_pci_epf_ctrl *ctrl,
251 u32 off, u64 val)
252 {
253 nvmet_pci_epf_bar_write32(ctrl, off, val & 0xFFFFFFFF);
254 nvmet_pci_epf_bar_write32(ctrl, off + 4, (val >> 32) & 0xFFFFFFFF);
255 }
256
257 static inline int nvmet_pci_epf_mem_map(struct nvmet_pci_epf *nvme_epf,
258 u64 pci_addr, size_t size, struct pci_epc_map *map)
259 {
260 struct pci_epf *epf = nvme_epf->epf;
261
262 return pci_epc_mem_map(epf->epc, epf->func_no, epf->vfunc_no,
263 pci_addr, size, map);
264 }
265
266 static inline void nvmet_pci_epf_mem_unmap(struct nvmet_pci_epf *nvme_epf,
267 struct pci_epc_map *map)
268 {
269 struct pci_epf *epf = nvme_epf->epf;
270
271 pci_epc_mem_unmap(epf->epc, epf->func_no, epf->vfunc_no, map);
272 }
273
274 struct nvmet_pci_epf_dma_filter {
275 struct device *dev;
276 u32 dma_mask;
277 };
278
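/*
 * Channel filter for dma_request_channel(): only accept channels provided by
 * the endpoint controller parent DMA device and supporting the requested
 * transfer direction (DMA_DEV_TO_MEM for RX, DMA_MEM_TO_DEV for TX).
 */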
279 static bool nvmet_pci_epf_dma_filter(struct dma_chan *chan, void *arg)
280 {
281 struct nvmet_pci_epf_dma_filter *filter = arg;
282 struct dma_slave_caps caps;
283
284 memset(&caps, 0, sizeof(caps));
285 dma_get_slave_caps(chan, &caps);
286
287 return chan->device->dev == filter->dev &&
288 (filter->dma_mask & caps.directions);
289 }
290
291 static void nvmet_pci_epf_init_dma(struct nvmet_pci_epf *nvme_epf)
292 {
293 struct pci_epf *epf = nvme_epf->epf;
294 struct device *dev = &epf->dev;
295 struct nvmet_pci_epf_dma_filter filter;
296 struct dma_chan *chan;
297 dma_cap_mask_t mask;
298
299 mutex_init(&nvme_epf->dma_rx_lock);
300 mutex_init(&nvme_epf->dma_tx_lock);
301
302 dma_cap_zero(mask);
303 dma_cap_set(DMA_SLAVE, mask);
304
305 filter.dev = epf->epc->dev.parent;
306 filter.dma_mask = BIT(DMA_DEV_TO_MEM);
307
308 chan = dma_request_channel(mask, nvmet_pci_epf_dma_filter, &filter);
309 if (!chan)
310 goto out_dma_no_rx;
311
312 nvme_epf->dma_rx_chan = chan;
313
314 filter.dma_mask = BIT(DMA_MEM_TO_DEV);
315 chan = dma_request_channel(mask, nvmet_pci_epf_dma_filter, &filter);
316 if (!chan)
317 goto out_dma_no_tx;
318
319 nvme_epf->dma_tx_chan = chan;
320
321 nvme_epf->dma_enabled = true;
322
323 dev_dbg(dev, "Using DMA RX channel %s, maximum segment size %u B\n",
324 dma_chan_name(chan),
325 dma_get_max_seg_size(dmaengine_get_dma_device(chan)));
326
327 dev_dbg(dev, "Using DMA TX channel %s, maximum segment size %u B\n",
328 dma_chan_name(chan),
329 dma_get_max_seg_size(dmaengine_get_dma_device(chan)));
330
331 return;
332
333 out_dma_no_tx:
334 dma_release_channel(nvme_epf->dma_rx_chan);
335 nvme_epf->dma_rx_chan = NULL;
336
337 out_dma_no_rx:
338 mutex_destroy(&nvme_epf->dma_rx_lock);
339 mutex_destroy(&nvme_epf->dma_tx_lock);
340 nvme_epf->dma_enabled = false;
341
342 dev_info(&epf->dev, "DMA not supported, falling back to MMIO\n");
343 }
344
345 static void nvmet_pci_epf_deinit_dma(struct nvmet_pci_epf *nvme_epf)
346 {
347 if (!nvme_epf->dma_enabled)
348 return;
349
350 dma_release_channel(nvme_epf->dma_tx_chan);
351 nvme_epf->dma_tx_chan = NULL;
352 dma_release_channel(nvme_epf->dma_rx_chan);
353 nvme_epf->dma_rx_chan = NULL;
354 mutex_destroy(&nvme_epf->dma_rx_lock);
355 mutex_destroy(&nvme_epf->dma_tx_lock);
356 nvme_epf->dma_enabled = false;
357 }
358
359 static int nvmet_pci_epf_dma_transfer(struct nvmet_pci_epf *nvme_epf,
360 struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
361 {
362 struct pci_epf *epf = nvme_epf->epf;
363 struct dma_async_tx_descriptor *desc;
364 struct dma_slave_config sconf = {};
365 struct device *dev = &epf->dev;
366 struct device *dma_dev;
367 struct dma_chan *chan;
368 dma_cookie_t cookie;
369 dma_addr_t dma_addr;
370 struct mutex *lock;
371 int ret;
372
373 switch (dir) {
374 case DMA_FROM_DEVICE:
375 lock = &nvme_epf->dma_rx_lock;
376 chan = nvme_epf->dma_rx_chan;
377 sconf.direction = DMA_DEV_TO_MEM;
378 sconf.src_addr = seg->pci_addr;
379 break;
380 case DMA_TO_DEVICE:
381 lock = &nvme_epf->dma_tx_lock;
382 chan = nvme_epf->dma_tx_chan;
383 sconf.direction = DMA_MEM_TO_DEV;
384 sconf.dst_addr = seg->pci_addr;
385 break;
386 default:
387 return -EINVAL;
388 }
389
390 mutex_lock(lock);
391
392 dma_dev = dmaengine_get_dma_device(chan);
393 dma_addr = dma_map_single(dma_dev, seg->buf, seg->length, dir);
394 ret = dma_mapping_error(dma_dev, dma_addr);
395 if (ret)
396 goto unlock;
397
398 ret = dmaengine_slave_config(chan, &sconf);
399 if (ret) {
400 dev_err(dev, "Failed to configure DMA channel\n");
401 goto unmap;
402 }
403
404 desc = dmaengine_prep_slave_single(chan, dma_addr, seg->length,
405 sconf.direction, DMA_CTRL_ACK);
406 if (!desc) {
407 dev_err(dev, "Failed to prepare DMA\n");
408 ret = -EIO;
409 goto unmap;
410 }
411
412 cookie = dmaengine_submit(desc);
413 ret = dma_submit_error(cookie);
414 if (ret) {
415 dev_err(dev, "Failed to do DMA submit (err=%d)\n", ret);
416 goto unmap;
417 }
418
419 if (dma_sync_wait(chan, cookie) != DMA_COMPLETE) {
420 dev_err(dev, "DMA transfer failed\n");
421 ret = -EIO;
422 }
423
424 dmaengine_terminate_sync(chan);
425
426 unmap:
427 dma_unmap_single(dma_dev, dma_addr, seg->length, dir);
428
429 unlock:
430 mutex_unlock(lock);
431
432 return ret;
433 }
434
435 static int nvmet_pci_epf_mmio_transfer(struct nvmet_pci_epf *nvme_epf,
436 struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
437 {
438 u64 pci_addr = seg->pci_addr;
439 u32 length = seg->length;
440 void *buf = seg->buf;
441 struct pci_epc_map map;
442 int ret = -EINVAL;
443
444 /*
445 * Note: MMIO transfers do not need serialization but this is a
446 * simple way to avoid using too many mapping windows.
447 */
448 mutex_lock(&nvme_epf->mmio_lock);
449
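/*
 * The outbound mapping window may be smaller than the transfer, so map,
 * copy and unmap chunk by chunk, using the size actually mapped
 * (map.pci_size) to advance through the buffer.
 */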
450 while (length) {
451 ret = nvmet_pci_epf_mem_map(nvme_epf, pci_addr, length, &map);
452 if (ret)
453 break;
454
455 switch (dir) {
456 case DMA_FROM_DEVICE:
457 memcpy_fromio(buf, map.virt_addr, map.pci_size);
458 break;
459 case DMA_TO_DEVICE:
460 memcpy_toio(map.virt_addr, buf, map.pci_size);
461 break;
462 default:
463 ret = -EINVAL;
464 goto unlock;
465 }
466
467 pci_addr += map.pci_size;
468 buf += map.pci_size;
469 length -= map.pci_size;
470
471 nvmet_pci_epf_mem_unmap(nvme_epf, &map);
472 }
473
474 unlock:
475 mutex_unlock(&nvme_epf->mmio_lock);
476
477 return ret;
478 }
479
480 static inline int nvmet_pci_epf_transfer_seg(struct nvmet_pci_epf *nvme_epf,
481 struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir)
482 {
483 if (nvme_epf->dma_enabled)
484 return nvmet_pci_epf_dma_transfer(nvme_epf, seg, dir);
485
486 return nvmet_pci_epf_mmio_transfer(nvme_epf, seg, dir);
487 }
488
489 static inline int nvmet_pci_epf_transfer(struct nvmet_pci_epf_ctrl *ctrl,
490 void *buf, u64 pci_addr, u32 length,
491 enum dma_data_direction dir)
492 {
493 struct nvmet_pci_epf_segment seg = {
494 .buf = buf,
495 .pci_addr = pci_addr,
496 .length = length,
497 };
498
499 return nvmet_pci_epf_transfer_seg(ctrl->nvme_epf, &seg, dir);
500 }
501
502 static int nvmet_pci_epf_alloc_irq_vectors(struct nvmet_pci_epf_ctrl *ctrl)
503 {
504 ctrl->irq_vectors = kcalloc(ctrl->nr_queues,
505 sizeof(struct nvmet_pci_epf_irq_vector),
506 GFP_KERNEL);
507 if (!ctrl->irq_vectors)
508 return -ENOMEM;
509
510 mutex_init(&ctrl->irq_lock);
511
512 return 0;
513 }
514
515 static void nvmet_pci_epf_free_irq_vectors(struct nvmet_pci_epf_ctrl *ctrl)
516 {
517 if (ctrl->irq_vectors) {
518 mutex_destroy(&ctrl->irq_lock);
519 kfree(ctrl->irq_vectors);
520 ctrl->irq_vectors = NULL;
521 }
522 }
523
524 static struct nvmet_pci_epf_irq_vector *
525 nvmet_pci_epf_find_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, u16 vector)
526 {
527 struct nvmet_pci_epf_irq_vector *iv;
528 int i;
529
530 lockdep_assert_held(&ctrl->irq_lock);
531
532 for (i = 0; i < ctrl->nr_queues; i++) {
533 iv = &ctrl->irq_vectors[i];
534 if (iv->ref && iv->vector == vector)
535 return iv;
536 }
537
538 return NULL;
539 }
540
541 static struct nvmet_pci_epf_irq_vector *
542 nvmet_pci_epf_add_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, u16 vector)
543 {
544 struct nvmet_pci_epf_irq_vector *iv;
545 int i;
546
547 mutex_lock(&ctrl->irq_lock);
548
549 iv = nvmet_pci_epf_find_irq_vector(ctrl, vector);
550 if (iv) {
551 iv->ref++;
552 goto unlock;
553 }
554
555 for (i = 0; i < ctrl->nr_queues; i++) {
556 iv = &ctrl->irq_vectors[i];
557 if (!iv->ref)
558 break;
559 }
560
561 if (WARN_ON_ONCE(!iv))
562 goto unlock;
563
564 iv->ref = 1;
565 iv->vector = vector;
566 iv->nr_irqs = 0;
567
568 unlock:
569 mutex_unlock(&ctrl->irq_lock);
570
571 return iv;
572 }
573
574 static void nvmet_pci_epf_remove_irq_vector(struct nvmet_pci_epf_ctrl *ctrl,
575 u16 vector)
576 {
577 struct nvmet_pci_epf_irq_vector *iv;
578
579 mutex_lock(&ctrl->irq_lock);
580
581 iv = nvmet_pci_epf_find_irq_vector(ctrl, vector);
582 if (iv) {
583 iv->ref--;
584 if (!iv->ref) {
585 iv->vector = 0;
586 iv->nr_irqs = 0;
587 }
588 }
589
590 mutex_unlock(&ctrl->irq_lock);
591 }
592
593 static bool nvmet_pci_epf_should_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
594 struct nvmet_pci_epf_queue *cq, bool force)
595 {
596 struct nvmet_pci_epf_irq_vector *iv = cq->iv;
597 bool ret;
598
599 if (!test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
600 return false;
601
602 /* IRQ coalescing for the admin queue is not allowed. */
603 if (!cq->qid)
604 return true;
605
606 if (iv->cd)
607 return true;
608
609 if (force) {
610 ret = iv->nr_irqs > 0;
611 } else {
612 iv->nr_irqs++;
613 ret = iv->nr_irqs >= ctrl->irq_vector_threshold;
614 }
615 if (ret)
616 iv->nr_irqs = 0;
617
618 return ret;
619 }
620
621 static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
622 struct nvmet_pci_epf_queue *cq, bool force)
623 {
624 struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf;
625 struct pci_epf *epf = nvme_epf->epf;
626 int ret = 0;
627
628 if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
629 return;
630
631 mutex_lock(&ctrl->irq_lock);
632
633 if (!nvmet_pci_epf_should_raise_irq(ctrl, cq, force))
634 goto unlock;
635
636 switch (nvme_epf->irq_type) {
637 case PCI_IRQ_MSIX:
638 case PCI_IRQ_MSI:
639 ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no,
640 nvme_epf->irq_type, cq->vector + 1);
641 if (!ret)
642 break;
643 /*
644 * If we got an error, it is likely because the host is using
645 * legacy IRQs (e.g. BIOS, grub).
646 */
647 fallthrough;
648 case PCI_IRQ_INTX:
649 ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no,
650 PCI_IRQ_INTX, 0);
651 break;
652 default:
653 WARN_ON_ONCE(1);
654 ret = -EINVAL;
655 break;
656 }
657
658 if (ret)
659 dev_err(ctrl->dev, "Failed to raise IRQ (err=%d)\n", ret);
660
661 unlock:
662 mutex_unlock(&ctrl->irq_lock);
663 }
664
665 static inline const char *nvmet_pci_epf_iod_name(struct nvmet_pci_epf_iod *iod)
666 {
667 return nvme_opcode_str(iod->sq->qid, iod->cmd.common.opcode);
668 }
669
670 static void nvmet_pci_epf_exec_iod_work(struct work_struct *work);
671
672 static struct nvmet_pci_epf_iod *
673 nvmet_pci_epf_alloc_iod(struct nvmet_pci_epf_queue *sq)
674 {
675 struct nvmet_pci_epf_ctrl *ctrl = sq->ctrl;
676 struct nvmet_pci_epf_iod *iod;
677
678 iod = mempool_alloc(&ctrl->iod_pool, GFP_KERNEL);
679 if (unlikely(!iod))
680 return NULL;
681
682 memset(iod, 0, sizeof(*iod));
683 iod->req.cmd = &iod->cmd;
684 iod->req.cqe = &iod->cqe;
685 iod->req.port = ctrl->port;
686 iod->ctrl = ctrl;
687 iod->sq = sq;
688 iod->cq = &ctrl->cq[sq->qid];
689 INIT_LIST_HEAD(&iod->link);
690 iod->dma_dir = DMA_NONE;
691 INIT_WORK(&iod->work, nvmet_pci_epf_exec_iod_work);
692 init_completion(&iod->done);
693
694 return iod;
695 }
696
697 /*
698 * Allocate or grow a command table of PCI segments.
699 */
700 static int nvmet_pci_epf_alloc_iod_data_segs(struct nvmet_pci_epf_iod *iod,
701 int nsegs)
702 {
703 struct nvmet_pci_epf_segment *segs;
704 int nr_segs = iod->nr_data_segs + nsegs;
705
706 segs = krealloc(iod->data_segs,
707 nr_segs * sizeof(struct nvmet_pci_epf_segment),
708 GFP_KERNEL | __GFP_ZERO);
709 if (!segs)
710 return -ENOMEM;
711
712 iod->nr_data_segs = nr_segs;
713 iod->data_segs = segs;
714
715 return 0;
716 }
717
718 static void nvmet_pci_epf_free_iod(struct nvmet_pci_epf_iod *iod)
719 {
720 int i;
721
722 if (iod->data_segs) {
723 for (i = 0; i < iod->nr_data_segs; i++)
724 kfree(iod->data_segs[i].buf);
725 if (iod->data_segs != &iod->data_seg)
726 kfree(iod->data_segs);
727 }
728 if (iod->data_sgt.nents > 1)
729 sg_free_table(&iod->data_sgt);
730 mempool_free(iod, &iod->ctrl->iod_pool);
731 }
732
733 static int nvmet_pci_epf_transfer_iod_data(struct nvmet_pci_epf_iod *iod)
734 {
735 struct nvmet_pci_epf *nvme_epf = iod->ctrl->nvme_epf;
736 struct nvmet_pci_epf_segment *seg = &iod->data_segs[0];
737 int i, ret;
738
739 /* Split the data transfer according to the PCI segments. */
740 for (i = 0; i < iod->nr_data_segs; i++, seg++) {
741 ret = nvmet_pci_epf_transfer_seg(nvme_epf, seg, iod->dma_dir);
742 if (ret) {
743 iod->status = NVME_SC_DATA_XFER_ERROR | NVME_STATUS_DNR;
744 return ret;
745 }
746 }
747
748 return 0;
749 }
750
751 static inline u32 nvmet_pci_epf_prp_ofst(struct nvmet_pci_epf_ctrl *ctrl,
752 u64 prp)
753 {
754 return prp & ctrl->mps_mask;
755 }
756
757 static inline size_t nvmet_pci_epf_prp_size(struct nvmet_pci_epf_ctrl *ctrl,
758 u64 prp)
759 {
760 return ctrl->mps - nvmet_pci_epf_prp_ofst(ctrl, prp);
761 }
762
763 /*
764 * Transfer a PRP list from the host and return the number of prps.
765 */
766 static int nvmet_pci_epf_get_prp_list(struct nvmet_pci_epf_ctrl *ctrl, u64 prp,
767 size_t xfer_len, __le64 *prps)
768 {
769 size_t nr_prps = (xfer_len + ctrl->mps_mask) >> ctrl->mps_shift;
770 u32 length;
771 int ret;
772
773 /*
774 * Compute the number of PRPs required for the number of bytes to
775 * transfer (xfer_len). If this number overflows the memory page size
776 * with the PRP list pointer specified, only return the space available
777 * in the memory page, the last PRP in there will be a PRP list pointer
778 * to the remaining PRPs.
779 */
780 length = min(nvmet_pci_epf_prp_size(ctrl, prp), nr_prps << 3);
781 ret = nvmet_pci_epf_transfer(ctrl, prps, prp, length, DMA_FROM_DEVICE);
782 if (ret)
783 return ret;
784
785 return length >> 3;
786 }
787
788 static int nvmet_pci_epf_iod_parse_prp_list(struct nvmet_pci_epf_ctrl *ctrl,
789 struct nvmet_pci_epf_iod *iod)
790 {
791 struct nvme_command *cmd = &iod->cmd;
792 struct nvmet_pci_epf_segment *seg;
793 size_t size = 0, ofst, prp_size, xfer_len;
794 size_t transfer_len = iod->data_len;
795 int nr_segs, nr_prps = 0;
796 u64 pci_addr, prp;
797 int i = 0, ret;
798 __le64 *prps;
799
800 prps = kzalloc(ctrl->mps, GFP_KERNEL);
801 if (!prps)
802 goto err_internal;
803
804 /*
805 * Allocate PCI segments for the command: this considers the worst case
806 * scenario where all prps are discontiguous, so get as many segments
807 * as we can have prps. In practice, most of the time, we will have
808 * far less PCI segments than prps.
809 */
810 prp = le64_to_cpu(cmd->common.dptr.prp1);
811 if (!prp)
812 goto err_invalid_field;
813
814 ofst = nvmet_pci_epf_prp_ofst(ctrl, prp);
815 nr_segs = (transfer_len + ofst + ctrl->mps - 1) >> ctrl->mps_shift;
816
817 ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_segs);
818 if (ret)
819 goto err_internal;
820
821 /* Set the first segment using prp1. */
822 seg = &iod->data_segs[0];
823 seg->pci_addr = prp;
824 seg->length = nvmet_pci_epf_prp_size(ctrl, prp);
825
826 size = seg->length;
827 pci_addr = prp + size;
828 nr_segs = 1;
829
830 /*
831 * Now build the PCI address segments using the PRP lists, starting
832 * from prp2.
833 */
834 prp = le64_to_cpu(cmd->common.dptr.prp2);
835 if (!prp)
836 goto err_invalid_field;
837
838 while (size < transfer_len) {
839 xfer_len = transfer_len - size;
840
841 if (!nr_prps) {
842 nr_prps = nvmet_pci_epf_get_prp_list(ctrl, prp,
843 xfer_len, prps);
844 if (nr_prps < 0)
845 goto err_internal;
846
847 i = 0;
848 ofst = 0;
849 }
850
851 /* Current entry */
852 prp = le64_to_cpu(prps[i]);
853 if (!prp)
854 goto err_invalid_field;
855
856 /* Did we reach the last PRP entry of the list? */
857 if (xfer_len > ctrl->mps && i == nr_prps - 1) {
858 /* We need more PRPs: PRP is a list pointer. */
859 nr_prps = 0;
860 continue;
861 }
862
863 /* Only the first PRP is allowed to have an offset. */
864 if (nvmet_pci_epf_prp_ofst(ctrl, prp))
865 goto err_invalid_offset;
866
867 if (prp != pci_addr) {
868 /* Discontiguous prp: new segment. */
869 nr_segs++;
870 if (WARN_ON_ONCE(nr_segs > iod->nr_data_segs))
871 goto err_internal;
872
873 seg++;
874 seg->pci_addr = prp;
875 seg->length = 0;
876 pci_addr = prp;
877 }
878
879 prp_size = min_t(size_t, ctrl->mps, xfer_len);
880 seg->length += prp_size;
881 pci_addr += prp_size;
882 size += prp_size;
883
884 i++;
885 }
886
887 iod->nr_data_segs = nr_segs;
888 ret = 0;
889
890 if (size != transfer_len) {
891 dev_err(ctrl->dev,
892 "PRPs transfer length mismatch: got %zu B, need %zu B\n",
893 size, transfer_len);
894 goto err_internal;
895 }
896
897 kfree(prps);
898
899 return 0;
900
901 err_invalid_offset:
902 dev_err(ctrl->dev, "PRPs list invalid offset\n");
903 iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
904 goto err;
905
906 err_invalid_field:
907 dev_err(ctrl->dev, "PRPs list invalid field\n");
908 iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
909 goto err;
910
911 err_internal:
912 dev_err(ctrl->dev, "PRPs list internal error\n");
913 iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
914
915 err:
916 kfree(prps);
917 return -EINVAL;
918 }
919
920 static int nvmet_pci_epf_iod_parse_prp_simple(struct nvmet_pci_epf_ctrl *ctrl,
921 struct nvmet_pci_epf_iod *iod)
922 {
923 struct nvme_command *cmd = &iod->cmd;
924 size_t transfer_len = iod->data_len;
925 int ret, nr_segs = 1;
926 u64 prp1, prp2 = 0;
927 size_t prp1_size;
928
929 prp1 = le64_to_cpu(cmd->common.dptr.prp1);
930 prp1_size = nvmet_pci_epf_prp_size(ctrl, prp1);
931
932 /* For commands crossing a page boundary, we should have prp2. */
933 if (transfer_len > prp1_size) {
934 prp2 = le64_to_cpu(cmd->common.dptr.prp2);
935 if (!prp2) {
936 iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
937 return -EINVAL;
938 }
939 if (nvmet_pci_epf_prp_ofst(ctrl, prp2)) {
940 iod->status =
941 NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
942 return -EINVAL;
943 }
944 if (prp2 != prp1 + prp1_size)
945 nr_segs = 2;
946 }
947
948 if (nr_segs == 1) {
949 iod->nr_data_segs = 1;
950 iod->data_segs = &iod->data_seg;
951 iod->data_segs[0].pci_addr = prp1;
952 iod->data_segs[0].length = transfer_len;
953 return 0;
954 }
955
956 ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_segs);
957 if (ret) {
958 iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
959 return ret;
960 }
961
962 iod->data_segs[0].pci_addr = prp1;
963 iod->data_segs[0].length = prp1_size;
964 iod->data_segs[1].pci_addr = prp2;
965 iod->data_segs[1].length = transfer_len - prp1_size;
966
967 return 0;
968 }
969
970 static int nvmet_pci_epf_iod_parse_prps(struct nvmet_pci_epf_iod *iod)
971 {
972 struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl;
973 u64 prp1 = le64_to_cpu(iod->cmd.common.dptr.prp1);
974 size_t ofst;
975
976 /* Get the PCI address segments for the command using its PRPs. */
977 ofst = nvmet_pci_epf_prp_ofst(ctrl, prp1);
978 if (ofst & 0x3) {
979 iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR;
980 return -EINVAL;
981 }
982
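/*
 * If the data fits within at most two memory pages, PRP1 and PRP2 point
 * directly at the data and no PRP list is involved. Otherwise, PRP2 is a
 * pointer to a PRP list that must be fetched and walked.
 */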
983 if (iod->data_len + ofst <= ctrl->mps * 2)
984 return nvmet_pci_epf_iod_parse_prp_simple(ctrl, iod);
985
986 return nvmet_pci_epf_iod_parse_prp_list(ctrl, iod);
987 }
988
989 /*
990 * Transfer an SGL segment from the host and return the number of data
991 * descriptors and the next segment descriptor, if any.
992 */
993 static struct nvme_sgl_desc *
994 nvmet_pci_epf_get_sgl_segment(struct nvmet_pci_epf_ctrl *ctrl,
995 struct nvme_sgl_desc *desc, unsigned int *nr_sgls)
996 {
997 struct nvme_sgl_desc *sgls;
998 u32 length = le32_to_cpu(desc->length);
999 int nr_descs, ret;
1000 void *buf;
1001
1002 buf = kmalloc(length, GFP_KERNEL);
1003 if (!buf)
1004 return NULL;
1005
1006 ret = nvmet_pci_epf_transfer(ctrl, buf, le64_to_cpu(desc->addr), length,
1007 DMA_FROM_DEVICE);
1008 if (ret) {
1009 kfree(buf);
1010 return NULL;
1011 }
1012
1013 sgls = buf;
1014 nr_descs = length / sizeof(struct nvme_sgl_desc);
1015 if (sgls[nr_descs - 1].type == (NVME_SGL_FMT_SEG_DESC << 4) ||
1016 sgls[nr_descs - 1].type == (NVME_SGL_FMT_LAST_SEG_DESC << 4)) {
1017 /*
1018 * We have another SGL segment following this one: do not count
1019 * it as a regular data SGL descriptor and return it to the
1020 * caller.
1021 */
1022 *desc = sgls[nr_descs - 1];
1023 nr_descs--;
1024 } else {
1025 /* We do not have another SGL segment after this one. */
1026 desc->length = 0;
1027 }
1028
1029 *nr_sgls = nr_descs;
1030
1031 return sgls;
1032 }
1033
1034 static int nvmet_pci_epf_iod_parse_sgl_segments(struct nvmet_pci_epf_ctrl *ctrl,
1035 struct nvmet_pci_epf_iod *iod)
1036 {
1037 struct nvme_command *cmd = &iod->cmd;
1038 struct nvme_sgl_desc seg = cmd->common.dptr.sgl;
1039 struct nvme_sgl_desc *sgls = NULL;
1040 int n = 0, i, nr_sgls;
1041 int ret;
1042
1043 /*
1044 * We do not support inline data nor keyed SGLs, so we should be seeing
1045 * only segment descriptors.
1046 */
1047 if (seg.type != (NVME_SGL_FMT_SEG_DESC << 4) &&
1048 seg.type != (NVME_SGL_FMT_LAST_SEG_DESC << 4)) {
1049 iod->status = NVME_SC_SGL_INVALID_TYPE | NVME_STATUS_DNR;
1050 return -EIO;
1051 }
1052
1053 while (seg.length) {
1054 sgls = nvmet_pci_epf_get_sgl_segment(ctrl, &seg, &nr_sgls);
1055 if (!sgls) {
1056 iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1057 return -EIO;
1058 }
1059
1060 /* Grow the PCI segment table as needed. */
1061 ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_sgls);
1062 if (ret) {
1063 iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1064 goto out;
1065 }
1066
1067 /*
1068 * Parse the SGL descriptors to build the PCI segment table,
1069 * checking the descriptor type as we go.
1070 */
1071 for (i = 0; i < nr_sgls; i++) {
1072 if (sgls[i].type != (NVME_SGL_FMT_DATA_DESC << 4)) {
1073 iod->status = NVME_SC_SGL_INVALID_TYPE |
1074 NVME_STATUS_DNR;
1075 goto out;
1076 }
1077 iod->data_segs[n].pci_addr = le64_to_cpu(sgls[i].addr);
1078 iod->data_segs[n].length = le32_to_cpu(sgls[i].length);
1079 n++;
1080 }
1081
1082 kfree(sgls);
1083 }
1084
1085 out:
1086 if (iod->status != NVME_SC_SUCCESS) {
1087 kfree(sgls);
1088 return -EIO;
1089 }
1090
1091 return 0;
1092 }
1093
1094 static int nvmet_pci_epf_iod_parse_sgls(struct nvmet_pci_epf_iod *iod)
1095 {
1096 struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl;
1097 struct nvme_sgl_desc *sgl = &iod->cmd.common.dptr.sgl;
1098
1099 if (sgl->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
1100 /* Single data descriptor case. */
1101 iod->nr_data_segs = 1;
1102 iod->data_segs = &iod->data_seg;
1103 iod->data_seg.pci_addr = le64_to_cpu(sgl->addr);
1104 iod->data_seg.length = le32_to_cpu(sgl->length);
1105 return 0;
1106 }
1107
1108 return nvmet_pci_epf_iod_parse_sgl_segments(ctrl, iod);
1109 }
1110
1111 static int nvmet_pci_epf_alloc_iod_data_buf(struct nvmet_pci_epf_iod *iod)
1112 {
1113 struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl;
1114 struct nvmet_req *req = &iod->req;
1115 struct nvmet_pci_epf_segment *seg;
1116 struct scatterlist *sg;
1117 int ret, i;
1118
1119 if (iod->data_len > ctrl->mdts) {
1120 iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1121 return -EINVAL;
1122 }
1123
1124 /*
1125 * Get the PCI address segments for the command data buffer using either
1126 * its SGLs or PRPs.
1127 */
1128 if (iod->cmd.common.flags & NVME_CMD_SGL_ALL)
1129 ret = nvmet_pci_epf_iod_parse_sgls(iod);
1130 else
1131 ret = nvmet_pci_epf_iod_parse_prps(iod);
1132 if (ret)
1133 return ret;
1134
1135 /* Get a command buffer using SGLs matching the PCI segments. */
1136 if (iod->nr_data_segs == 1) {
1137 sg_init_table(&iod->data_sgl, 1);
1138 iod->data_sgt.sgl = &iod->data_sgl;
1139 iod->data_sgt.nents = 1;
1140 iod->data_sgt.orig_nents = 1;
1141 } else {
1142 ret = sg_alloc_table(&iod->data_sgt, iod->nr_data_segs,
1143 GFP_KERNEL);
1144 if (ret)
1145 goto err_nomem;
1146 }
1147
1148 for_each_sgtable_sg(&iod->data_sgt, sg, i) {
1149 seg = &iod->data_segs[i];
1150 seg->buf = kmalloc(seg->length, GFP_KERNEL);
1151 if (!seg->buf)
1152 goto err_nomem;
1153 sg_set_buf(sg, seg->buf, seg->length);
1154 }
1155
1156 req->transfer_len = iod->data_len;
1157 req->sg = iod->data_sgt.sgl;
1158 req->sg_cnt = iod->data_sgt.nents;
1159
1160 return 0;
1161
1162 err_nomem:
1163 iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1164 return -ENOMEM;
1165 }
1166
1167 static void nvmet_pci_epf_complete_iod(struct nvmet_pci_epf_iod *iod)
1168 {
1169 struct nvmet_pci_epf_queue *cq = iod->cq;
1170 unsigned long flags;
1171
1172 /* Print an error message for failed commands, except AENs. */
1173 iod->status = le16_to_cpu(iod->cqe.status) >> 1;
1174 if (iod->status && iod->cmd.common.opcode != nvme_admin_async_event)
1175 dev_err(iod->ctrl->dev,
1176 "CQ[%d]: Command %s (0x%x) status 0x%0x\n",
1177 iod->sq->qid, nvmet_pci_epf_iod_name(iod),
1178 iod->cmd.common.opcode, iod->status);
1179
1180 /*
1181 * Add the command to the list of completed commands and schedule the
1182 * CQ work.
1183 */
1184 spin_lock_irqsave(&cq->lock, flags);
1185 list_add_tail(&iod->link, &cq->list);
1186 queue_delayed_work(system_highpri_wq, &cq->work, 0);
1187 spin_unlock_irqrestore(&cq->lock, flags);
1188 }
1189
1190 static void nvmet_pci_epf_drain_queue(struct nvmet_pci_epf_queue *queue)
1191 {
1192 struct nvmet_pci_epf_iod *iod;
1193 unsigned long flags;
1194
1195 spin_lock_irqsave(&queue->lock, flags);
1196 while (!list_empty(&queue->list)) {
1197 iod = list_first_entry(&queue->list, struct nvmet_pci_epf_iod,
1198 link);
1199 list_del_init(&iod->link);
1200 nvmet_pci_epf_free_iod(iod);
1201 }
1202 spin_unlock_irqrestore(&queue->lock, flags);
1203 }
1204
1205 static int nvmet_pci_epf_add_port(struct nvmet_port *port)
1206 {
1207 mutex_lock(&nvmet_pci_epf_ports_mutex);
1208 list_add_tail(&port->entry, &nvmet_pci_epf_ports);
1209 mutex_unlock(&nvmet_pci_epf_ports_mutex);
1210 return 0;
1211 }
1212
1213 static void nvmet_pci_epf_remove_port(struct nvmet_port *port)
1214 {
1215 mutex_lock(&nvmet_pci_epf_ports_mutex);
1216 list_del_init(&port->entry);
1217 mutex_unlock(&nvmet_pci_epf_ports_mutex);
1218 }
1219
1220 static struct nvmet_port *
1221 nvmet_pci_epf_find_port(struct nvmet_pci_epf_ctrl *ctrl, __le16 portid)
1222 {
1223 struct nvmet_port *p, *port = NULL;
1224
1225 mutex_lock(&nvmet_pci_epf_ports_mutex);
1226 list_for_each_entry(p, &nvmet_pci_epf_ports, entry) {
1227 if (p->disc_addr.portid == portid) {
1228 port = p;
1229 break;
1230 }
1231 }
1232 mutex_unlock(&nvmet_pci_epf_ports_mutex);
1233
1234 return port;
1235 }
1236
1237 static void nvmet_pci_epf_queue_response(struct nvmet_req *req)
1238 {
1239 struct nvmet_pci_epf_iod *iod =
1240 container_of(req, struct nvmet_pci_epf_iod, req);
1241
1242 iod->status = le16_to_cpu(req->cqe->status) >> 1;
1243
1244 /* If we have no data to transfer, directly complete the command. */
1245 if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE) {
1246 nvmet_pci_epf_complete_iod(iod);
1247 return;
1248 }
1249
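/*
 * The command has data to deliver to the host: wake up
 * nvmet_pci_epf_exec_iod_work(), which will do the transfer and then
 * complete the command.
 */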
1250 complete(&iod->done);
1251 }
1252
1253 static u8 nvmet_pci_epf_get_mdts(const struct nvmet_ctrl *tctrl)
1254 {
1255 struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1256 int page_shift = NVME_CAP_MPSMIN(tctrl->cap) + 12;
1257
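/*
 * MDTS is reported as a power of two, in units of the controller minimum
 * memory page size (CAP.MPSMIN).
 */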
1258 return ilog2(ctrl->mdts) - page_shift;
1259 }
1260
1261 static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
1262 u16 cqid, u16 flags, u16 qsize, u64 pci_addr, u16 vector)
1263 {
1264 struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1265 struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
1266 u16 status;
1267
1268 if (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
1269 return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1270
1271 if (!(flags & NVME_QUEUE_PHYS_CONTIG))
1272 return NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR;
1273
1274 cq->pci_addr = pci_addr;
1275 cq->qid = cqid;
1276 cq->depth = qsize + 1;
1277 cq->vector = vector;
1278 cq->head = 0;
1279 cq->tail = 0;
1280 cq->phase = 1;
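/*
 * With a 4 B doorbell stride (CAP.DSTRD == 0), the head doorbell of CQ
 * "cqid" is at offset NVME_REG_DBS + (2 * cqid + 1) * 4.
 */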
1281 cq->db = NVME_REG_DBS + (((cqid * 2) + 1) * sizeof(u32));
1282 nvmet_pci_epf_bar_write32(ctrl, cq->db, 0);
1283
1284 if (!cqid)
1285 cq->qes = sizeof(struct nvme_completion);
1286 else
1287 cq->qes = ctrl->io_cqes;
1288 cq->pci_size = cq->qes * cq->depth;
1289
1290 if (flags & NVME_CQ_IRQ_ENABLED) {
1291 cq->iv = nvmet_pci_epf_add_irq_vector(ctrl, vector);
1292 if (!cq->iv)
1293 return NVME_SC_INTERNAL | NVME_STATUS_DNR;
1294 set_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags);
1295 }
1296
1297 status = nvmet_cq_create(tctrl, &cq->nvme_cq, cqid, cq->depth);
1298 if (status != NVME_SC_SUCCESS)
1299 goto err;
1300
1301 set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags);
1302
1303 dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n",
1304 cqid, qsize, cq->qes, cq->vector);
1305
1306 return NVME_SC_SUCCESS;
1307
1308 err:
1309 if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
1310 nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
1311 return status;
1312 }
1313
1314 static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid)
1315 {
1316 struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1317 struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
1318
1319 if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
1320 return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1321
1322 cancel_delayed_work_sync(&cq->work);
1323 nvmet_pci_epf_drain_queue(cq);
1324 nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
1325
1326 return NVME_SC_SUCCESS;
1327 }
1328
1329 static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
1330 u16 sqid, u16 flags, u16 qsize, u64 pci_addr)
1331 {
1332 struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1333 struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
1334 u16 status;
1335
1336 if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
1337 return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1338
1339 if (!(flags & NVME_QUEUE_PHYS_CONTIG))
1340 return NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR;
1341
1342 sq->pci_addr = pci_addr;
1343 sq->qid = sqid;
1344 sq->depth = qsize + 1;
1345 sq->head = 0;
1346 sq->tail = 0;
1347 sq->phase = 0;
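/* The tail doorbell of SQ "sqid" is at offset NVME_REG_DBS + 2 * sqid * 4. */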
1348 sq->db = NVME_REG_DBS + (sqid * 2 * sizeof(u32));
1349 nvmet_pci_epf_bar_write32(ctrl, sq->db, 0);
1350 if (!sqid)
1351 sq->qes = 1UL << NVME_ADM_SQES;
1352 else
1353 sq->qes = ctrl->io_sqes;
1354 sq->pci_size = sq->qes * sq->depth;
1355
1356 status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth);
1357 if (status != NVME_SC_SUCCESS)
1358 return status;
1359
1360 sq->iod_wq = alloc_workqueue("sq%d_wq", WQ_UNBOUND,
1361 min_t(int, sq->depth, WQ_MAX_ACTIVE), sqid);
1362 if (!sq->iod_wq) {
1363 dev_err(ctrl->dev, "Failed to create SQ %d work queue\n", sqid);
1364 status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
1365 goto out_destroy_sq;
1366 }
1367
1368 set_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags);
1369
1370 dev_dbg(ctrl->dev, "SQ[%u]: %u entries of %zu B\n",
1371 sqid, qsize, sq->qes);
1372
1373 return NVME_SC_SUCCESS;
1374
1375 out_destroy_sq:
1376 nvmet_sq_destroy(&sq->nvme_sq);
1377 return status;
1378 }
1379
1380 static u16 nvmet_pci_epf_delete_sq(struct nvmet_ctrl *tctrl, u16 sqid)
1381 {
1382 struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1383 struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
1384
1385 if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
1386 return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1387
1388 flush_workqueue(sq->iod_wq);
1389 destroy_workqueue(sq->iod_wq);
1390 sq->iod_wq = NULL;
1391
1392 nvmet_pci_epf_drain_queue(sq);
1393
1394 if (sq->nvme_sq.ctrl)
1395 nvmet_sq_destroy(&sq->nvme_sq);
1396
1397 return NVME_SC_SUCCESS;
1398 }
1399
1400 static u16 nvmet_pci_epf_get_feat(const struct nvmet_ctrl *tctrl,
1401 u8 feat, void *data)
1402 {
1403 struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1404 struct nvmet_feat_arbitration *arb;
1405 struct nvmet_feat_irq_coalesce *irqc;
1406 struct nvmet_feat_irq_config *irqcfg;
1407 struct nvmet_pci_epf_irq_vector *iv;
1408 u16 status;
1409
1410 switch (feat) {
1411 case NVME_FEAT_ARBITRATION:
1412 arb = data;
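/* sq_ab == 0 means no burst limit, reported as the special value 0x7. */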
1413 if (!ctrl->sq_ab)
1414 arb->ab = 0x7;
1415 else
1416 arb->ab = ilog2(ctrl->sq_ab);
1417 return NVME_SC_SUCCESS;
1418
1419 case NVME_FEAT_IRQ_COALESCE:
1420 irqc = data;
1421 irqc->thr = ctrl->irq_vector_threshold;
1422 irqc->time = 0;
1423 return NVME_SC_SUCCESS;
1424
1425 case NVME_FEAT_IRQ_CONFIG:
1426 irqcfg = data;
1427 mutex_lock(&ctrl->irq_lock);
1428 iv = nvmet_pci_epf_find_irq_vector(ctrl, irqcfg->iv);
1429 if (iv) {
1430 irqcfg->cd = iv->cd;
1431 status = NVME_SC_SUCCESS;
1432 } else {
1433 status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1434 }
1435 mutex_unlock(&ctrl->irq_lock);
1436 return status;
1437
1438 default:
1439 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1440 }
1441 }
1442
1443 static u16 nvmet_pci_epf_set_feat(const struct nvmet_ctrl *tctrl,
1444 u8 feat, void *data)
1445 {
1446 struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
1447 struct nvmet_feat_arbitration *arb;
1448 struct nvmet_feat_irq_coalesce *irqc;
1449 struct nvmet_feat_irq_config *irqcfg;
1450 struct nvmet_pci_epf_irq_vector *iv;
1451 u16 status;
1452
1453 switch (feat) {
1454 case NVME_FEAT_ARBITRATION:
1455 arb = data;
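/*
 * An arbitration burst value of 0x7 means "no limit" per the NVMe
 * specification: map it to sq_ab == 0, which disables the burst limit
 * in nvmet_pci_epf_process_sq().
 */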
1456 if (arb->ab == 0x7)
1457 ctrl->sq_ab = 0;
1458 else
1459 ctrl->sq_ab = 1 << arb->ab;
1460 return NVME_SC_SUCCESS;
1461
1462 case NVME_FEAT_IRQ_COALESCE:
1463 /*
1464 * Since we do not implement precise IRQ coalescing timing,
1465 * ignore the time field.
1466 */
1467 irqc = data;
1468 ctrl->irq_vector_threshold = irqc->thr + 1;
1469 return NVME_SC_SUCCESS;
1470
1471 case NVME_FEAT_IRQ_CONFIG:
1472 irqcfg = data;
1473 mutex_lock(&ctrl->irq_lock);
1474 iv = nvmet_pci_epf_find_irq_vector(ctrl, irqcfg->iv);
1475 if (iv) {
1476 iv->cd = irqcfg->cd;
1477 status = NVME_SC_SUCCESS;
1478 } else {
1479 status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1480 }
1481 mutex_unlock(&ctrl->irq_lock);
1482 return status;
1483
1484 default:
1485 return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
1486 }
1487 }
1488
1489 static const struct nvmet_fabrics_ops nvmet_pci_epf_fabrics_ops = {
1490 .owner = THIS_MODULE,
1491 .type = NVMF_TRTYPE_PCI,
1492 .add_port = nvmet_pci_epf_add_port,
1493 .remove_port = nvmet_pci_epf_remove_port,
1494 .queue_response = nvmet_pci_epf_queue_response,
1495 .get_mdts = nvmet_pci_epf_get_mdts,
1496 .create_cq = nvmet_pci_epf_create_cq,
1497 .delete_cq = nvmet_pci_epf_delete_cq,
1498 .create_sq = nvmet_pci_epf_create_sq,
1499 .delete_sq = nvmet_pci_epf_delete_sq,
1500 .get_feature = nvmet_pci_epf_get_feat,
1501 .set_feature = nvmet_pci_epf_set_feat,
1502 };
1503
1504 static void nvmet_pci_epf_cq_work(struct work_struct *work);
1505
1506 static void nvmet_pci_epf_init_queue(struct nvmet_pci_epf_ctrl *ctrl,
1507 unsigned int qid, bool sq)
1508 {
1509 struct nvmet_pci_epf_queue *queue;
1510
1511 if (sq) {
1512 queue = &ctrl->sq[qid];
1513 set_bit(NVMET_PCI_EPF_Q_IS_SQ, &queue->flags);
1514 } else {
1515 queue = &ctrl->cq[qid];
1516 INIT_DELAYED_WORK(&queue->work, nvmet_pci_epf_cq_work);
1517 }
1518 queue->ctrl = ctrl;
1519 queue->qid = qid;
1520 spin_lock_init(&queue->lock);
1521 INIT_LIST_HEAD(&queue->list);
1522 }
1523
1524 static int nvmet_pci_epf_alloc_queues(struct nvmet_pci_epf_ctrl *ctrl)
1525 {
1526 unsigned int qid;
1527
1528 ctrl->sq = kcalloc(ctrl->nr_queues,
1529 sizeof(struct nvmet_pci_epf_queue), GFP_KERNEL);
1530 if (!ctrl->sq)
1531 return -ENOMEM;
1532
1533 ctrl->cq = kcalloc(ctrl->nr_queues,
1534 sizeof(struct nvmet_pci_epf_queue), GFP_KERNEL);
1535 if (!ctrl->cq) {
1536 kfree(ctrl->sq);
1537 ctrl->sq = NULL;
1538 return -ENOMEM;
1539 }
1540
1541 for (qid = 0; qid < ctrl->nr_queues; qid++) {
1542 nvmet_pci_epf_init_queue(ctrl, qid, true);
1543 nvmet_pci_epf_init_queue(ctrl, qid, false);
1544 }
1545
1546 return 0;
1547 }
1548
1549 static void nvmet_pci_epf_free_queues(struct nvmet_pci_epf_ctrl *ctrl)
1550 {
1551 kfree(ctrl->sq);
1552 ctrl->sq = NULL;
1553 kfree(ctrl->cq);
1554 ctrl->cq = NULL;
1555 }
1556
1557 static int nvmet_pci_epf_map_queue(struct nvmet_pci_epf_ctrl *ctrl,
1558 struct nvmet_pci_epf_queue *queue)
1559 {
1560 struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf;
1561 int ret;
1562
1563 ret = nvmet_pci_epf_mem_map(nvme_epf, queue->pci_addr,
1564 queue->pci_size, &queue->pci_map);
1565 if (ret) {
1566 dev_err(ctrl->dev, "Failed to map queue %u (err=%d)\n",
1567 queue->qid, ret);
1568 return ret;
1569 }
1570
1571 if (queue->pci_map.pci_size < queue->pci_size) {
1572 dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n",
1573 queue->qid);
1574 nvmet_pci_epf_mem_unmap(nvme_epf, &queue->pci_map);
1575 return -ENOMEM;
1576 }
1577
1578 return 0;
1579 }
1580
1581 static inline void nvmet_pci_epf_unmap_queue(struct nvmet_pci_epf_ctrl *ctrl,
1582 struct nvmet_pci_epf_queue *queue)
1583 {
1584 nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &queue->pci_map);
1585 }
1586
1587 static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
1588 {
1589 struct nvmet_pci_epf_iod *iod =
1590 container_of(work, struct nvmet_pci_epf_iod, work);
1591 struct nvmet_req *req = &iod->req;
1592 int ret;
1593
1594 if (!iod->ctrl->link_up) {
1595 nvmet_pci_epf_free_iod(iod);
1596 return;
1597 }
1598
1599 if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &iod->sq->flags)) {
1600 iod->status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
1601 goto complete;
1602 }
1603
1604 if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq,
1605 &nvmet_pci_epf_fabrics_ops))
1606 goto complete;
1607
1608 iod->data_len = nvmet_req_transfer_len(req);
1609 if (iod->data_len) {
1610 /*
1611 * Get the data DMA transfer direction. Here "device" means the
1612 * PCI root-complex host.
1613 */
1614 if (nvme_is_write(&iod->cmd))
1615 iod->dma_dir = DMA_FROM_DEVICE;
1616 else
1617 iod->dma_dir = DMA_TO_DEVICE;
1618
1619 /*
1620 * Setup the command data buffer and get the command data from
1621 * the host if needed.
1622 */
1623 ret = nvmet_pci_epf_alloc_iod_data_buf(iod);
1624 if (!ret && iod->dma_dir == DMA_FROM_DEVICE)
1625 ret = nvmet_pci_epf_transfer_iod_data(iod);
1626 if (ret) {
1627 nvmet_req_uninit(req);
1628 goto complete;
1629 }
1630 }
1631
1632 req->execute(req);
1633
1634 /*
1635 * If we do not have data to transfer after the command execution
1636 * finishes, nvmet_pci_epf_queue_response() will complete the command
1637 * directly. No need to wait for the completion in this case.
1638 */
1639 if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE)
1640 return;
1641
1642 wait_for_completion(&iod->done);
1643
1644 if (iod->status == NVME_SC_SUCCESS) {
1645 WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE);
1646 nvmet_pci_epf_transfer_iod_data(iod);
1647 }
1648
1649 complete:
1650 nvmet_pci_epf_complete_iod(iod);
1651 }
1652
1653 static int nvmet_pci_epf_process_sq(struct nvmet_pci_epf_ctrl *ctrl,
1654 struct nvmet_pci_epf_queue *sq)
1655 {
1656 struct nvmet_pci_epf_iod *iod;
1657 int ret, n = 0;
1658 u16 head = sq->head;
1659
1660 sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db);
1661 while (head != sq->tail && (!ctrl->sq_ab || n < ctrl->sq_ab)) {
1662 iod = nvmet_pci_epf_alloc_iod(sq);
1663 if (!iod)
1664 break;
1665
1666 /* Get the NVMe command submitted by the host. */
1667 ret = nvmet_pci_epf_transfer(ctrl, &iod->cmd,
1668 sq->pci_addr + head * sq->qes,
1669 sq->qes, DMA_FROM_DEVICE);
1670 if (ret) {
1671 /* Not much we can do... */
1672 nvmet_pci_epf_free_iod(iod);
1673 break;
1674 }
1675
1676 dev_dbg(ctrl->dev, "SQ[%u]: head %u, tail %u, command %s\n",
1677 sq->qid, head, sq->tail,
1678 nvmet_pci_epf_iod_name(iod));
1679
1680 head++;
1681 if (head == sq->depth)
1682 head = 0;
1683 WRITE_ONCE(sq->head, head);
1684 n++;
1685
1686 queue_work_on(WORK_CPU_UNBOUND, sq->iod_wq, &iod->work);
1687
1688 sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db);
1689 }
1690
1691 return n;
1692 }
1693
1694 static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work)
1695 {
1696 struct nvmet_pci_epf_ctrl *ctrl =
1697 container_of(work, struct nvmet_pci_epf_ctrl, poll_sqs.work);
1698 struct nvmet_pci_epf_queue *sq;
1699 unsigned long limit = jiffies;
1700 unsigned long last = 0;
1701 int i, nr_sqs;
1702
1703 while (ctrl->link_up && ctrl->enabled) {
1704 nr_sqs = 0;
1705 /* Do round-robin arbitration. */
1706 for (i = 0; i < ctrl->nr_queues; i++) {
1707 sq = &ctrl->sq[i];
1708 if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
1709 continue;
1710 if (nvmet_pci_epf_process_sq(ctrl, sq))
1711 nr_sqs++;
1712 }
1713
1714 /*
1715 * If we have been running for a while, reschedule to let other
1716 * tasks run and to avoid RCU stalls.
1717 */
1718 if (time_is_before_jiffies(limit + secs_to_jiffies(1))) {
1719 cond_resched();
1720 limit = jiffies;
1721 continue;
1722 }
1723
1724 if (nr_sqs) {
1725 last = jiffies;
1726 continue;
1727 }
1728
1729 /*
1730 * If we have not received any command on any queue for more
1731 * than NVMET_PCI_EPF_SQ_POLL_IDLE, assume we are idle and
1732 * reschedule. This avoids "burning" a CPU when the controller
1733 * is idle for a long time.
1734 */
1735 if (time_is_before_jiffies(last + NVMET_PCI_EPF_SQ_POLL_IDLE))
1736 break;
1737
1738 cpu_relax();
1739 }
1740
1741 schedule_delayed_work(&ctrl->poll_sqs, NVMET_PCI_EPF_SQ_POLL_INTERVAL);
1742 }
1743
1744 static void nvmet_pci_epf_cq_work(struct work_struct *work)
1745 {
1746 struct nvmet_pci_epf_queue *cq =
1747 container_of(work, struct nvmet_pci_epf_queue, work.work);
1748 struct nvmet_pci_epf_ctrl *ctrl = cq->ctrl;
1749 struct nvme_completion *cqe;
1750 struct nvmet_pci_epf_iod *iod;
1751 unsigned long flags;
1752 int ret, n = 0;
1753
1754 ret = nvmet_pci_epf_map_queue(ctrl, cq);
1755 if (ret)
1756 goto again;
1757
1758 while (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) && ctrl->link_up) {
1759
1760 /* Check that the CQ is not full. */
1761 cq->head = nvmet_pci_epf_bar_read32(ctrl, cq->db);
1762 if (cq->head == cq->tail + 1) {
1763 ret = -EAGAIN;
1764 break;
1765 }
1766
1767 spin_lock_irqsave(&cq->lock, flags);
1768 iod = list_first_entry_or_null(&cq->list,
1769 struct nvmet_pci_epf_iod, link);
1770 if (iod)
1771 list_del_init(&iod->link);
1772 spin_unlock_irqrestore(&cq->lock, flags);
1773
1774 if (!iod)
1775 break;
1776
1777 /*
1778 * Post the IOD completion entry. If the IOD request was
1779 * executed (req->execute() called), the CQE is already
1780 * initialized. However, the IOD may have been failed before
1781 * that, leaving the CQE not properly initialized. So always
1782 * initialize it here.
1783 */
1784 cqe = &iod->cqe;
1785 cqe->sq_head = cpu_to_le16(READ_ONCE(iod->sq->head));
1786 cqe->sq_id = cpu_to_le16(iod->sq->qid);
1787 cqe->command_id = iod->cmd.common.command_id;
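/*
 * The Phase Tag bit of the status field is inverted on each pass through
 * the CQ ring, so the host can tell newly posted entries from ones it has
 * already consumed.
 */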
1788 cqe->status = cpu_to_le16((iod->status << 1) | cq->phase);
1789
1790 dev_dbg(ctrl->dev,
1791 "CQ[%u]: %s status 0x%x, result 0x%llx, head %u, tail %u, phase %u\n",
1792 cq->qid, nvmet_pci_epf_iod_name(iod), iod->status,
1793 le64_to_cpu(cqe->result.u64), cq->head, cq->tail,
1794 cq->phase);
1795
1796 memcpy_toio(cq->pci_map.virt_addr + cq->tail * cq->qes,
1797 cqe, cq->qes);
1798
1799 cq->tail++;
1800 if (cq->tail >= cq->depth) {
1801 cq->tail = 0;
1802 cq->phase ^= 1;
1803 }
1804
1805 nvmet_pci_epf_free_iod(iod);
1806
1807 /* Signal the host. */
1808 nvmet_pci_epf_raise_irq(ctrl, cq, false);
1809 n++;
1810 }
1811
1812 nvmet_pci_epf_unmap_queue(ctrl, cq);
1813
1814 /*
1815 * We do not support precise IRQ coalescing time (100ns units as per
1816 * NVMe specifications). So if we have posted completion entries without
1817 * reaching the interrupt coalescing threshold, raise an interrupt.
1818 */
1819 if (n)
1820 nvmet_pci_epf_raise_irq(ctrl, cq, true);
1821
1822 again:
1823 if (ret < 0)
1824 queue_delayed_work(system_highpri_wq, &cq->work,
1825 NVMET_PCI_EPF_CQ_RETRY_INTERVAL);
1826 }
1827
1828 static void nvmet_pci_epf_clear_ctrl_config(struct nvmet_pci_epf_ctrl *ctrl)
1829 {
1830 struct nvmet_ctrl *tctrl = ctrl->tctrl;
1831
1832 /* Initialize controller status. */
1833 tctrl->csts = 0;
1834 ctrl->csts = 0;
1835 nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);
1836
1837 /* Initialize controller configuration and start polling. */
1838 tctrl->cc = 0;
1839 ctrl->cc = 0;
1840 nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CC, ctrl->cc);
1841 }
1842
1843 static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
1844 {
1845 u64 pci_addr, asq, acq;
1846 u32 aqa;
1847 u16 status, qsize;
1848
1849 if (ctrl->enabled)
1850 return 0;
1851
1852 dev_info(ctrl->dev, "Enabling controller\n");
1853
1854 ctrl->mps_shift = nvmet_cc_mps(ctrl->cc) + 12;
1855 ctrl->mps = 1UL << ctrl->mps_shift;
1856 ctrl->mps_mask = ctrl->mps - 1;
1857
1858 ctrl->io_sqes = 1UL << nvmet_cc_iosqes(ctrl->cc);
1859 if (ctrl->io_sqes < sizeof(struct nvme_command)) {
1860 dev_err(ctrl->dev, "Unsupported I/O SQES %zu (need %zu)\n",
1861 ctrl->io_sqes, sizeof(struct nvme_command));
1862 goto err;
1863 }
1864
1865 ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc);
1866 if (ctrl->io_cqes < sizeof(struct nvme_completion)) {
1867 dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n",
1868 ctrl->io_cqes, sizeof(struct nvme_completion));
1869 goto err;
1870 }
1871
1872 /* Create the admin queue. */
1873 aqa = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_AQA);
1874 asq = nvmet_pci_epf_bar_read64(ctrl, NVME_REG_ASQ);
1875 acq = nvmet_pci_epf_bar_read64(ctrl, NVME_REG_ACQ);
1876
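/*
 * AQA bits 27:16 give the admin CQ size (ACQS) and bits 11:0 the admin SQ
 * size (ASQS), both as 0's based values. ASQ and ACQ hold the 4 KB aligned
 * PCI addresses of the admin queues.
 */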
	qsize = (aqa & 0x0fff0000) >> 16;
	pci_addr = acq & GENMASK_ULL(63, 12);
	status = nvmet_pci_epf_create_cq(ctrl->tctrl, 0,
				NVME_CQ_IRQ_ENABLED | NVME_QUEUE_PHYS_CONTIG,
				qsize, pci_addr, 0);
	if (status != NVME_SC_SUCCESS) {
		dev_err(ctrl->dev, "Failed to create admin completion queue\n");
		goto err;
	}

	qsize = aqa & 0x00000fff;
	pci_addr = asq & GENMASK_ULL(63, 12);
	status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG,
				qsize, pci_addr);
	if (status != NVME_SC_SUCCESS) {
		dev_err(ctrl->dev, "Failed to create admin submission queue\n");
		nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
		goto err;
	}

	ctrl->sq_ab = NVMET_PCI_EPF_SQ_AB;
	ctrl->irq_vector_threshold = NVMET_PCI_EPF_IV_THRESHOLD;
	ctrl->enabled = true;
	ctrl->csts = NVME_CSTS_RDY;

	/* Start polling the controller SQs. */
	schedule_delayed_work(&ctrl->poll_sqs, 0);

	return 0;

err:
	nvmet_pci_epf_clear_ctrl_config(ctrl);
	return -EINVAL;
}

static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl,
				       bool shutdown)
{
	int qid;

	if (!ctrl->enabled)
		return;

	dev_info(ctrl->dev, "%s controller\n",
		 shutdown ? "Shutting down" : "Disabling");

	ctrl->enabled = false;
	cancel_delayed_work_sync(&ctrl->poll_sqs);

	/* Delete all I/O queues first. */
	for (qid = 1; qid < ctrl->nr_queues; qid++)
		nvmet_pci_epf_delete_sq(ctrl->tctrl, qid);

	for (qid = 1; qid < ctrl->nr_queues; qid++)
		nvmet_pci_epf_delete_cq(ctrl->tctrl, qid);

	/* Delete the admin queue last. */
	nvmet_pci_epf_delete_sq(ctrl->tctrl, 0);
	nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);

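	/*
	 * The controller is no longer ready: clear CSTS.RDY and, if this is a
	 * shutdown, report shutdown completion and clear CC.EN.
	 */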
	ctrl->csts &= ~NVME_CSTS_RDY;
	if (shutdown) {
		ctrl->csts |= NVME_CSTS_SHST_CMPLT;
		ctrl->cc &= ~NVME_CC_ENABLE;
		nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CC, ctrl->cc);
	}
}

static void nvmet_pci_epf_poll_cc_work(struct work_struct *work)
{
	struct nvmet_pci_epf_ctrl *ctrl =
		container_of(work, struct nvmet_pci_epf_ctrl, poll_cc.work);
	u32 old_cc, new_cc;
	int ret;

	if (!ctrl->tctrl)
		return;

	old_cc = ctrl->cc;
	new_cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC);
	if (new_cc == old_cc)
		goto reschedule_work;

	ctrl->cc = new_cc;

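	/* Handle the CC.EN and CC.SHN transitions written by the host. */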
	if (nvmet_cc_en(new_cc) && !nvmet_cc_en(old_cc)) {
		ret = nvmet_pci_epf_enable_ctrl(ctrl);
		if (ret)
			goto reschedule_work;
	}

	if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc))
		nvmet_pci_epf_disable_ctrl(ctrl, false);

	if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc))
		nvmet_pci_epf_disable_ctrl(ctrl, true);

	if (!nvmet_cc_shn(new_cc) && nvmet_cc_shn(old_cc))
		ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;

	nvmet_update_cc(ctrl->tctrl, ctrl->cc);
	nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts);

reschedule_work:
	schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
}

static void nvmet_pci_epf_init_bar(struct nvmet_pci_epf_ctrl *ctrl)
{
	struct nvmet_ctrl *tctrl = ctrl->tctrl;

	ctrl->bar = ctrl->nvme_epf->reg_bar;

	/* Copy the target controller capabilities as a base. */
	ctrl->cap = tctrl->cap;

	/* Set Contiguous Queues Required (CQR). */
	ctrl->cap |= 0x1ULL << 16;

	/* Set Doorbell stride to 4B (DSTRD). */
	ctrl->cap &= ~GENMASK_ULL(35, 32);

	/* Clear NVM Subsystem Reset Supported (NSSRS). */
	ctrl->cap &= ~(0x1ULL << 36);

	/* Clear Boot Partition Support (BPS). */
	ctrl->cap &= ~(0x1ULL << 45);

	/* Clear Persistent Memory Region Supported (PMRS). */
	ctrl->cap &= ~(0x1ULL << 56);

	/* Clear Controller Memory Buffer Supported (CMBS). */
	ctrl->cap &= ~(0x1ULL << 57);

	nvmet_pci_epf_bar_write64(ctrl, NVME_REG_CAP, ctrl->cap);
	nvmet_pci_epf_bar_write32(ctrl, NVME_REG_VS, tctrl->subsys->ver);

	nvmet_pci_epf_clear_ctrl_config(ctrl);
}

static int nvmet_pci_epf_create_ctrl(struct nvmet_pci_epf *nvme_epf,
				     unsigned int max_nr_queues)
{
	struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
	struct nvmet_alloc_ctrl_args args = {};
	char hostnqn[NVMF_NQN_SIZE];
	uuid_t id;
	int ret;

	memset(ctrl, 0, sizeof(*ctrl));
	ctrl->dev = &nvme_epf->epf->dev;
	mutex_init(&ctrl->irq_lock);
	ctrl->nvme_epf = nvme_epf;
	ctrl->mdts = nvme_epf->mdts_kb * SZ_1K;
	INIT_DELAYED_WORK(&ctrl->poll_cc, nvmet_pci_epf_poll_cc_work);
	INIT_DELAYED_WORK(&ctrl->poll_sqs, nvmet_pci_epf_poll_sqs_work);

	ret = mempool_init_kmalloc_pool(&ctrl->iod_pool,
					max_nr_queues * NVMET_MAX_QUEUE_SIZE,
					sizeof(struct nvmet_pci_epf_iod));
	if (ret) {
		dev_err(ctrl->dev, "Failed to initialize IOD mempool\n");
		return ret;
	}

	ctrl->port = nvmet_pci_epf_find_port(ctrl, nvme_epf->portid);
	if (!ctrl->port) {
		dev_err(ctrl->dev, "Port not found\n");
		ret = -EINVAL;
		goto out_mempool_exit;
	}

	/* Create the target controller. */
	uuid_gen(&id);
	snprintf(hostnqn, NVMF_NQN_SIZE,
		 "nqn.2014-08.org.nvmexpress:uuid:%pUb", &id);
	args.port = ctrl->port;
	args.subsysnqn = nvme_epf->subsysnqn;
	memset(&id, 0, sizeof(uuid_t));
	args.hostid = &id;
	args.hostnqn = hostnqn;
	args.ops = &nvmet_pci_epf_fabrics_ops;

	ctrl->tctrl = nvmet_alloc_ctrl(&args);
	if (!ctrl->tctrl) {
		dev_err(ctrl->dev, "Failed to create target controller\n");
		ret = -ENOMEM;
		goto out_mempool_exit;
	}
	ctrl->tctrl->drvdata = ctrl;

	/* We do not support protection information for now. */
	if (ctrl->tctrl->pi_support) {
		dev_err(ctrl->dev,
			"Protection information (PI) is not supported\n");
		ret = -ENOTSUPP;
		goto out_put_ctrl;
	}

	/* Allocate our queues, up to the maximum number. */
	ctrl->nr_queues = min(ctrl->tctrl->subsys->max_qid + 1, max_nr_queues);
	ret = nvmet_pci_epf_alloc_queues(ctrl);
	if (ret)
		goto out_put_ctrl;

	/*
	 * Allocate the IRQ vector descriptors. We cannot have more vectors
	 * than the maximum number of queues.
	 */
	ret = nvmet_pci_epf_alloc_irq_vectors(ctrl);
	if (ret)
		goto out_free_queues;

	dev_info(ctrl->dev,
		 "New PCI ctrl \"%s\", %u I/O queues, mdts %u B\n",
		 ctrl->tctrl->subsys->subsysnqn, ctrl->nr_queues - 1,
		 ctrl->mdts);

	/* Initialize BAR 0 using the target controller CAP. */
	nvmet_pci_epf_init_bar(ctrl);

	return 0;

out_free_queues:
	nvmet_pci_epf_free_queues(ctrl);
out_put_ctrl:
	nvmet_ctrl_put(ctrl->tctrl);
	ctrl->tctrl = NULL;
out_mempool_exit:
	mempool_exit(&ctrl->iod_pool);
	return ret;
}

static void nvmet_pci_epf_start_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
{
	schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
}

static void nvmet_pci_epf_stop_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
{
	cancel_delayed_work_sync(&ctrl->poll_cc);

	nvmet_pci_epf_disable_ctrl(ctrl, false);
	nvmet_pci_epf_clear_ctrl_config(ctrl);
}

static void nvmet_pci_epf_destroy_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
{
	if (!ctrl->tctrl)
		return;

	dev_info(ctrl->dev, "Destroying PCI ctrl \"%s\"\n",
		 ctrl->tctrl->subsys->subsysnqn);

	nvmet_pci_epf_stop_ctrl(ctrl);

	nvmet_pci_epf_free_queues(ctrl);
	nvmet_pci_epf_free_irq_vectors(ctrl);

	nvmet_ctrl_put(ctrl->tctrl);
	ctrl->tctrl = NULL;

	mempool_exit(&ctrl->iod_pool);
}

static int nvmet_pci_epf_configure_bar(struct nvmet_pci_epf *nvme_epf)
{
	struct pci_epf *epf = nvme_epf->epf;
	const struct pci_epc_features *epc_features = nvme_epf->epc_features;
	size_t reg_size, reg_bar_size;
	size_t msix_table_size = 0;

	/*
	 * The first free BAR will be our register BAR and, per the NVMe
	 * specifications, it must be BAR 0.
	 */
	if (pci_epc_get_first_free_bar(epc_features) != BAR_0) {
		dev_err(&epf->dev, "BAR 0 is not free\n");
		return -ENODEV;
	}

	/*
	 * While NVMe PCIe Transport Specification 1.1, section 2.1.10, claims
	 * that the BAR0 type is Implementation Specific, in NVMe 1.1, the type
	 * is required to be 64-bit. Thus, for interoperability, always set the
	 * type to 64-bit. In the rare case that the PCI EPC does not support
	 * configuring BAR0 as 64-bit, the call to pci_epc_set_bar() will fail,
	 * and we will return failure back to the user.
	 */
	epf->bar[BAR_0].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64;

	/*
	 * Calculate the size of the register BAR: NVMe registers first with
	 * enough space for the doorbells, followed by the MSI-X table
	 * if supported.
	 */
	reg_size = NVME_REG_DBS + (NVMET_NR_QUEUES * 2 * sizeof(u32));
	reg_size = ALIGN(reg_size, 8);

	if (epc_features->msix_capable) {
		size_t pba_size;

		msix_table_size = PCI_MSIX_ENTRY_SIZE * epf->msix_interrupts;
		nvme_epf->msix_table_offset = reg_size;
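		/*
		 * The PBA needs one pending bit per vector: round up to whole
		 * bytes and align the result to a QWORD boundary.
		 */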
		pba_size = ALIGN(DIV_ROUND_UP(epf->msix_interrupts, 8), 8);

		reg_size += msix_table_size + pba_size;
	}

	if (epc_features->bar[BAR_0].type == BAR_FIXED) {
		if (reg_size > epc_features->bar[BAR_0].fixed_size) {
			dev_err(&epf->dev,
				"BAR 0 size %llu B too small, need %zu B\n",
				epc_features->bar[BAR_0].fixed_size,
				reg_size);
			return -ENOMEM;
		}
		reg_bar_size = epc_features->bar[BAR_0].fixed_size;
	} else {
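		/* Honor the EPC alignment constraint, with a 4 KiB minimum. */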
		reg_bar_size = ALIGN(reg_size, max(epc_features->align, 4096));
	}

	nvme_epf->reg_bar = pci_epf_alloc_space(epf, reg_bar_size, BAR_0,
						epc_features, PRIMARY_INTERFACE);
	if (!nvme_epf->reg_bar) {
		dev_err(&epf->dev, "Failed to allocate BAR 0\n");
		return -ENOMEM;
	}
	memset(nvme_epf->reg_bar, 0, reg_bar_size);

	return 0;
}

static void nvmet_pci_epf_free_bar(struct nvmet_pci_epf *nvme_epf)
{
	struct pci_epf *epf = nvme_epf->epf;

	if (!nvme_epf->reg_bar)
		return;

	pci_epf_free_space(epf, nvme_epf->reg_bar, BAR_0, PRIMARY_INTERFACE);
	nvme_epf->reg_bar = NULL;
}

static void nvmet_pci_epf_clear_bar(struct nvmet_pci_epf *nvme_epf)
{
	struct pci_epf *epf = nvme_epf->epf;

	pci_epc_clear_bar(epf->epc, epf->func_no, epf->vfunc_no,
			  &epf->bar[BAR_0]);
}

static int nvmet_pci_epf_init_irq(struct nvmet_pci_epf *nvme_epf)
{
	const struct pci_epc_features *epc_features = nvme_epf->epc_features;
	struct pci_epf *epf = nvme_epf->epf;
	int ret;

	/* Enable MSI-X if supported, otherwise use MSI. */
	if (epc_features->msix_capable && epf->msix_interrupts) {
		ret = pci_epc_set_msix(epf->epc, epf->func_no, epf->vfunc_no,
				       epf->msix_interrupts, BAR_0,
				       nvme_epf->msix_table_offset);
		if (ret) {
			dev_err(&epf->dev, "Failed to configure MSI-X\n");
			return ret;
		}

		nvme_epf->nr_vectors = epf->msix_interrupts;
		nvme_epf->irq_type = PCI_IRQ_MSIX;

		return 0;
	}

	if (epc_features->msi_capable && epf->msi_interrupts) {
		ret = pci_epc_set_msi(epf->epc, epf->func_no, epf->vfunc_no,
				      epf->msi_interrupts);
		if (ret) {
			dev_err(&epf->dev, "Failed to configure MSI\n");
			return ret;
		}

		nvme_epf->nr_vectors = epf->msi_interrupts;
		nvme_epf->irq_type = PCI_IRQ_MSI;

		return 0;
	}

	/* MSI and MSI-X are not supported: fall back to INTx. */
	nvme_epf->nr_vectors = 1;
	nvme_epf->irq_type = PCI_IRQ_INTX;

	return 0;
}

static int nvmet_pci_epf_epc_init(struct pci_epf *epf)
{
	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
	const struct pci_epc_features *epc_features = nvme_epf->epc_features;
	struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;
	unsigned int max_nr_queues = NVMET_NR_QUEUES;
	int ret;

	/* For now, do not support virtual functions. */
	if (epf->vfunc_no > 0) {
		dev_err(&epf->dev, "Virtual functions are not supported\n");
		return -EINVAL;
	}

	/*
	 * Cap the maximum number of queues we can support on the controller
	 * with the number of IRQs we can use.
	 */
	if (epc_features->msix_capable && epf->msix_interrupts) {
		dev_info(&epf->dev,
			 "PCI endpoint controller supports MSI-X, %u vectors\n",
			 epf->msix_interrupts);
		max_nr_queues = min(max_nr_queues, epf->msix_interrupts);
	} else if (epc_features->msi_capable && epf->msi_interrupts) {
		dev_info(&epf->dev,
			 "PCI endpoint controller supports MSI, %u vectors\n",
			 epf->msi_interrupts);
		max_nr_queues = min(max_nr_queues, epf->msi_interrupts);
	}

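	/* We need the admin queue plus at least one I/O queue pair. */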
	if (max_nr_queues < 2) {
		dev_err(&epf->dev, "Invalid maximum number of queues %u\n",
			max_nr_queues);
		return -EINVAL;
	}

	/* Create the target controller. */
	ret = nvmet_pci_epf_create_ctrl(nvme_epf, max_nr_queues);
	if (ret) {
		dev_err(&epf->dev,
			"Failed to create NVMe PCI target controller (err=%d)\n",
			ret);
		return ret;
	}

	/* Set device ID, class, etc. */
	epf->header->vendorid = ctrl->tctrl->subsys->vendor_id;
	epf->header->subsys_vendor_id = ctrl->tctrl->subsys->subsys_vendor_id;
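	/*
	 * Only the vendor IDs come from the target subsystem; the device ID
	 * and class codes come from the static nvme_epf_pci_header set at
	 * probe time.
	 */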
	ret = pci_epc_write_header(epf->epc, epf->func_no, epf->vfunc_no,
				   epf->header);
	if (ret) {
		dev_err(&epf->dev,
			"Failed to write configuration header (err=%d)\n", ret);
		goto out_destroy_ctrl;
	}

	ret = pci_epc_set_bar(epf->epc, epf->func_no, epf->vfunc_no,
			      &epf->bar[BAR_0]);
	if (ret) {
		dev_err(&epf->dev, "Failed to set BAR 0 (err=%d)\n", ret);
		goto out_destroy_ctrl;
	}

	/*
	 * Enable interrupts and start polling the controller BAR if we do not
	 * have a link up notifier.
	 */
	ret = nvmet_pci_epf_init_irq(nvme_epf);
	if (ret)
		goto out_clear_bar;

	if (!epc_features->linkup_notifier) {
		ctrl->link_up = true;
		nvmet_pci_epf_start_ctrl(&nvme_epf->ctrl);
	}

	return 0;

out_clear_bar:
	nvmet_pci_epf_clear_bar(nvme_epf);
out_destroy_ctrl:
	nvmet_pci_epf_destroy_ctrl(&nvme_epf->ctrl);
	return ret;
}

static void nvmet_pci_epf_epc_deinit(struct pci_epf *epf)
{
	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
	struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;

	ctrl->link_up = false;
	nvmet_pci_epf_destroy_ctrl(ctrl);

	nvmet_pci_epf_deinit_dma(nvme_epf);
	nvmet_pci_epf_clear_bar(nvme_epf);
}

static int nvmet_pci_epf_link_up(struct pci_epf *epf)
{
	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
	struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;

	ctrl->link_up = true;
	nvmet_pci_epf_start_ctrl(ctrl);

	return 0;
}

static int nvmet_pci_epf_link_down(struct pci_epf *epf)
{
	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
	struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl;

	ctrl->link_up = false;
	nvmet_pci_epf_stop_ctrl(ctrl);

	return 0;
}

static const struct pci_epc_event_ops nvmet_pci_epf_event_ops = {
	.epc_init = nvmet_pci_epf_epc_init,
	.epc_deinit = nvmet_pci_epf_epc_deinit,
	.link_up = nvmet_pci_epf_link_up,
	.link_down = nvmet_pci_epf_link_down,
};

static int nvmet_pci_epf_bind(struct pci_epf *epf)
{
	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
	const struct pci_epc_features *epc_features;
	struct pci_epc *epc = epf->epc;
	int ret;

	if (WARN_ON_ONCE(!epc))
		return -EINVAL;

	epc_features = pci_epc_get_features(epc, epf->func_no, epf->vfunc_no);
	if (!epc_features) {
		dev_err(&epf->dev, "epc_features not implemented\n");
		return -EOPNOTSUPP;
	}
	nvme_epf->epc_features = epc_features;

	ret = nvmet_pci_epf_configure_bar(nvme_epf);
	if (ret)
		return ret;

	nvmet_pci_epf_init_dma(nvme_epf);

	return 0;
}

static void nvmet_pci_epf_unbind(struct pci_epf *epf)
{
	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);
	struct pci_epc *epc = epf->epc;

	nvmet_pci_epf_destroy_ctrl(&nvme_epf->ctrl);

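	/* Only undo the EPC setup if the EPC initialization completed. */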
	if (epc->init_complete) {
		nvmet_pci_epf_deinit_dma(nvme_epf);
		nvmet_pci_epf_clear_bar(nvme_epf);
	}

	nvmet_pci_epf_free_bar(nvme_epf);
}

static struct pci_epf_header nvme_epf_pci_header = {
	.vendorid = PCI_ANY_ID,
	.deviceid = PCI_ANY_ID,
	.progif_code = 0x02, /* NVM Express */
	.baseclass_code = PCI_BASE_CLASS_STORAGE,
	.subclass_code = 0x08, /* Non-Volatile Memory controller */
	.interrupt_pin = PCI_INTERRUPT_INTA,
};

static int nvmet_pci_epf_probe(struct pci_epf *epf,
			       const struct pci_epf_device_id *id)
{
	struct nvmet_pci_epf *nvme_epf;
	int ret;

	nvme_epf = devm_kzalloc(&epf->dev, sizeof(*nvme_epf), GFP_KERNEL);
	if (!nvme_epf)
		return -ENOMEM;

	ret = devm_mutex_init(&epf->dev, &nvme_epf->mmio_lock);
	if (ret)
		return ret;

	nvme_epf->epf = epf;
	nvme_epf->mdts_kb = NVMET_PCI_EPF_MDTS_KB;

	epf->event_ops = &nvmet_pci_epf_event_ops;
	epf->header = &nvme_epf_pci_header;
	epf_set_drvdata(epf, nvme_epf);

	return 0;
}

#define to_nvme_epf(epf_group)	\
	container_of(epf_group, struct nvmet_pci_epf, group)

static ssize_t nvmet_pci_epf_portid_show(struct config_item *item, char *page)
{
	struct config_group *group = to_config_group(item);
	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);

	return sysfs_emit(page, "%u\n", le16_to_cpu(nvme_epf->portid));
}

static ssize_t nvmet_pci_epf_portid_store(struct config_item *item,
					  const char *page, size_t len)
{
	struct config_group *group = to_config_group(item);
	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
	u16 portid;

	/* Do not allow setting this when the function is already started. */
	if (nvme_epf->ctrl.tctrl)
		return -EBUSY;

	if (!len)
		return -EINVAL;

	if (kstrtou16(page, 0, &portid))
		return -EINVAL;

	nvme_epf->portid = cpu_to_le16(portid);

	return len;
}

CONFIGFS_ATTR(nvmet_pci_epf_, portid);

static ssize_t nvmet_pci_epf_subsysnqn_show(struct config_item *item,
					    char *page)
{
	struct config_group *group = to_config_group(item);
	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);

	return sysfs_emit(page, "%s\n", nvme_epf->subsysnqn);
}

static ssize_t nvmet_pci_epf_subsysnqn_store(struct config_item *item,
					     const char *page, size_t len)
{
	struct config_group *group = to_config_group(item);
	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);

	/* Do not allow setting this when the function is already started. */
	if (nvme_epf->ctrl.tctrl)
		return -EBUSY;

	if (!len)
		return -EINVAL;

	strscpy(nvme_epf->subsysnqn, page, len);

	return len;
}

CONFIGFS_ATTR(nvmet_pci_epf_, subsysnqn);

static ssize_t nvmet_pci_epf_mdts_kb_show(struct config_item *item, char *page)
{
	struct config_group *group = to_config_group(item);
	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);

	return sysfs_emit(page, "%u\n", nvme_epf->mdts_kb);
}

static ssize_t nvmet_pci_epf_mdts_kb_store(struct config_item *item,
					   const char *page, size_t len)
{
	struct config_group *group = to_config_group(item);
	struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group);
	unsigned long mdts_kb;
	int ret;

	if (nvme_epf->ctrl.tctrl)
		return -EBUSY;

	ret = kstrtoul(page, 0, &mdts_kb);
	if (ret)
		return ret;
	if (!mdts_kb)
		mdts_kb = NVMET_PCI_EPF_MDTS_KB;
	else if (mdts_kb > NVMET_PCI_EPF_MAX_MDTS_KB)
		mdts_kb = NVMET_PCI_EPF_MAX_MDTS_KB;

	if (!is_power_of_2(mdts_kb))
		return -EINVAL;

	nvme_epf->mdts_kb = mdts_kb;

	return len;
}

CONFIGFS_ATTR(nvmet_pci_epf_, mdts_kb);

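/*
 * Example configuration of these attributes through configfs. The paths and
 * values below are only an illustration and depend on the EPC platform and
 * the endpoint function instance name chosen by the user (here "func0"):
 *
 *   cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf/func0/nvme
 *   echo 1 > portid
 *   echo nqn.2014-08.org.nvmexpress.mysubsys > subsysnqn
 *   echo 512 > mdts_kb
 */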
static struct configfs_attribute *nvmet_pci_epf_attrs[] = {
	&nvmet_pci_epf_attr_portid,
	&nvmet_pci_epf_attr_subsysnqn,
	&nvmet_pci_epf_attr_mdts_kb,
	NULL,
};

static const struct config_item_type nvmet_pci_epf_group_type = {
	.ct_attrs = nvmet_pci_epf_attrs,
	.ct_owner = THIS_MODULE,
};

static struct config_group *nvmet_pci_epf_add_cfs(struct pci_epf *epf,
						  struct config_group *group)
{
	struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf);

	config_group_init_type_name(&nvme_epf->group, "nvme",
				    &nvmet_pci_epf_group_type);

	return &nvme_epf->group;
}

static const struct pci_epf_device_id nvmet_pci_epf_ids[] = {
	{ .name = "nvmet_pci_epf" },
	{},
};

static struct pci_epf_ops nvmet_pci_epf_ops = {
	.bind = nvmet_pci_epf_bind,
	.unbind = nvmet_pci_epf_unbind,
	.add_cfs = nvmet_pci_epf_add_cfs,
};

static struct pci_epf_driver nvmet_pci_epf_driver = {
	.driver.name = "nvmet_pci_epf",
	.probe = nvmet_pci_epf_probe,
	.id_table = nvmet_pci_epf_ids,
	.ops = &nvmet_pci_epf_ops,
	.owner = THIS_MODULE,
};

static int __init nvmet_pci_epf_init_module(void)
{
	int ret;

	ret = pci_epf_register_driver(&nvmet_pci_epf_driver);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_pci_epf_fabrics_ops);
	if (ret) {
		pci_epf_unregister_driver(&nvmet_pci_epf_driver);
		return ret;
	}

	return 0;
}

static void __exit nvmet_pci_epf_cleanup_module(void)
{
	nvmet_unregister_transport(&nvmet_pci_epf_fabrics_ops);
	pci_epf_unregister_driver(&nvmet_pci_epf_driver);
}

module_init(nvmet_pci_epf_init_module);
module_exit(nvmet_pci_epf_cleanup_module);

MODULE_DESCRIPTION("NVMe PCI Endpoint Function target driver");
MODULE_AUTHOR("Damien Le Moal <[email protected]>");
MODULE_LICENSE("GPL");