1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <[email protected]>,
6 * Ashok Raj <[email protected]>,
7 * Shaohua Li <[email protected]>,
8 * Anil S Keshavamurthy <[email protected]>,
9 * Fenghua Yu <[email protected]>
10 * Joerg Roedel <[email protected]>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
56
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59
60 /*
61 * set to 1 to panic kernel if can't successfully enable VT-d
62 * (used when kernel is launched w/ TXT)
63 */
64 static int force_on = 0;
65 static int intel_iommu_tboot_noforce;
66 static int no_platform_optin;
67
68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
69
70 /*
71 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
72 * if marked present.
73 */
root_entry_lctp(struct root_entry * re)74 static phys_addr_t root_entry_lctp(struct root_entry *re)
75 {
76 if (!(re->lo & 1))
77 return 0;
78
79 return re->lo & VTD_PAGE_MASK;
80 }
81
82 /*
83 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
84 * if marked present.
85 */
root_entry_uctp(struct root_entry * re)86 static phys_addr_t root_entry_uctp(struct root_entry *re)
87 {
88 if (!(re->hi & 1))
89 return 0;
90
91 return re->hi & VTD_PAGE_MASK;
92 }
93
device_rid_cmp_key(const void * key,const struct rb_node * node)94 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
95 {
96 struct device_domain_info *info =
97 rb_entry(node, struct device_domain_info, node);
98 const u16 *rid_lhs = key;
99
100 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
101 return -1;
102
103 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
104 return 1;
105
106 return 0;
107 }
108
device_rid_cmp(struct rb_node * lhs,const struct rb_node * rhs)109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
110 {
111 struct device_domain_info *info =
112 rb_entry(lhs, struct device_domain_info, node);
113 u16 key = PCI_DEVID(info->bus, info->devfn);
114
115 return device_rid_cmp_key(&key, rhs);
116 }
117
118 /*
119 * Looks up an IOMMU-probed device using its source ID.
120 *
121 * Returns the pointer to the device if there is a match. Otherwise,
122 * returns NULL.
123 *
124 * Note that this helper doesn't guarantee that the device won't be
125 * released by the iommu subsystem after being returned. The caller
126 * should use its own synchronization mechanism to avoid the device
127 * being released during its use if its possibly the case.
128 */
device_rbtree_find(struct intel_iommu * iommu,u16 rid)129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
130 {
131 struct device_domain_info *info = NULL;
132 struct rb_node *node;
133 unsigned long flags;
134
135 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
136 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
137 if (node)
138 info = rb_entry(node, struct device_domain_info, node);
139 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
140
141 return info ? info->dev : NULL;
142 }
143
device_rbtree_insert(struct intel_iommu * iommu,struct device_domain_info * info)144 static int device_rbtree_insert(struct intel_iommu *iommu,
145 struct device_domain_info *info)
146 {
147 struct rb_node *curr;
148 unsigned long flags;
149
150 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
151 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
152 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
153 if (WARN_ON(curr))
154 return -EEXIST;
155
156 return 0;
157 }
158
device_rbtree_remove(struct device_domain_info * info)159 static void device_rbtree_remove(struct device_domain_info *info)
160 {
161 struct intel_iommu *iommu = info->iommu;
162 unsigned long flags;
163
164 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
165 rb_erase(&info->node, &iommu->device_rbtree);
166 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
167 }
168
169 struct dmar_rmrr_unit {
170 struct list_head list; /* list of rmrr units */
171 struct acpi_dmar_header *hdr; /* ACPI header */
172 u64 base_address; /* reserved base address*/
173 u64 end_address; /* reserved end address */
174 struct dmar_dev_scope *devices; /* target devices */
175 int devices_cnt; /* target device count */
176 };
177
178 struct dmar_atsr_unit {
179 struct list_head list; /* list of ATSR units */
180 struct acpi_dmar_header *hdr; /* ACPI header */
181 struct dmar_dev_scope *devices; /* target devices */
182 int devices_cnt; /* target device count */
183 u8 include_all:1; /* include all ports */
184 };
185
186 struct dmar_satc_unit {
187 struct list_head list; /* list of SATC units */
188 struct acpi_dmar_header *hdr; /* ACPI header */
189 struct dmar_dev_scope *devices; /* target devices */
190 struct intel_iommu *iommu; /* the corresponding iommu */
191 int devices_cnt; /* target device count */
192 u8 atc_required:1; /* ATS is required */
193 };
194
195 static LIST_HEAD(dmar_atsr_units);
196 static LIST_HEAD(dmar_rmrr_units);
197 static LIST_HEAD(dmar_satc_units);
198
199 #define for_each_rmrr_units(rmrr) \
200 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
201
202 static void intel_iommu_domain_free(struct iommu_domain *domain);
203
204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
206
207 int intel_iommu_enabled = 0;
208 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
209
210 static int intel_iommu_superpage = 1;
211 static int iommu_identity_mapping;
212 static int iommu_skip_te_disable;
213 static int disable_igfx_iommu;
214
215 #define IDENTMAP_AZALIA 4
216
217 const struct iommu_ops intel_iommu_ops;
218 static const struct iommu_dirty_ops intel_dirty_ops;
219
translation_pre_enabled(struct intel_iommu * iommu)220 static bool translation_pre_enabled(struct intel_iommu *iommu)
221 {
222 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
223 }
224
clear_translation_pre_enabled(struct intel_iommu * iommu)225 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
226 {
227 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
228 }
229
init_translation_status(struct intel_iommu * iommu)230 static void init_translation_status(struct intel_iommu *iommu)
231 {
232 u32 gsts;
233
234 gsts = readl(iommu->reg + DMAR_GSTS_REG);
235 if (gsts & DMA_GSTS_TES)
236 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
237 }
238
intel_iommu_setup(char * str)239 static int __init intel_iommu_setup(char *str)
240 {
241 if (!str)
242 return -EINVAL;
243
244 while (*str) {
245 if (!strncmp(str, "on", 2)) {
246 dmar_disabled = 0;
247 pr_info("IOMMU enabled\n");
248 } else if (!strncmp(str, "off", 3)) {
249 dmar_disabled = 1;
250 no_platform_optin = 1;
251 pr_info("IOMMU disabled\n");
252 } else if (!strncmp(str, "igfx_off", 8)) {
253 disable_igfx_iommu = 1;
254 pr_info("Disable GFX device mapping\n");
255 } else if (!strncmp(str, "forcedac", 8)) {
256 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
257 iommu_dma_forcedac = true;
258 } else if (!strncmp(str, "strict", 6)) {
259 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
260 iommu_set_dma_strict();
261 } else if (!strncmp(str, "sp_off", 6)) {
262 pr_info("Disable supported super page\n");
263 intel_iommu_superpage = 0;
264 } else if (!strncmp(str, "sm_on", 5)) {
265 pr_info("Enable scalable mode if hardware supports\n");
266 intel_iommu_sm = 1;
267 } else if (!strncmp(str, "sm_off", 6)) {
268 pr_info("Scalable mode is disallowed\n");
269 intel_iommu_sm = 0;
270 } else if (!strncmp(str, "tboot_noforce", 13)) {
271 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
272 intel_iommu_tboot_noforce = 1;
273 } else {
274 pr_notice("Unknown option - '%s'\n", str);
275 }
276
277 str += strcspn(str, ",");
278 while (*str == ',')
279 str++;
280 }
281
282 return 1;
283 }
284 __setup("intel_iommu=", intel_iommu_setup);
285
domain_pfn_supported(struct dmar_domain * domain,unsigned long pfn)286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
287 {
288 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
289
290 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
291 }
292
293 /*
294 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
295 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
296 * the returned SAGAW.
297 */
__iommu_calculate_sagaw(struct intel_iommu * iommu)298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
299 {
300 unsigned long fl_sagaw, sl_sagaw;
301
302 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
303 sl_sagaw = cap_sagaw(iommu->cap);
304
305 /* Second level only. */
306 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
307 return sl_sagaw;
308
309 /* First level only. */
310 if (!ecap_slts(iommu->ecap))
311 return fl_sagaw;
312
313 return fl_sagaw & sl_sagaw;
314 }
315
__iommu_calculate_agaw(struct intel_iommu * iommu,int max_gaw)316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
317 {
318 unsigned long sagaw;
319 int agaw;
320
321 sagaw = __iommu_calculate_sagaw(iommu);
322 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
323 if (test_bit(agaw, &sagaw))
324 break;
325 }
326
327 return agaw;
328 }
329
330 /*
331 * Calculate max SAGAW for each iommu.
332 */
iommu_calculate_max_sagaw(struct intel_iommu * iommu)333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
334 {
335 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
336 }
337
338 /*
339 * calculate agaw for each iommu.
340 * "SAGAW" may be different across iommus, use a default agaw, and
341 * get a supported less agaw for iommus that don't support the default agaw.
342 */
iommu_calculate_agaw(struct intel_iommu * iommu)343 int iommu_calculate_agaw(struct intel_iommu *iommu)
344 {
345 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
346 }
347
iommu_paging_structure_coherency(struct intel_iommu * iommu)348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
349 {
350 return sm_supported(iommu) ?
351 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
352 }
353
354 /* Return the super pagesize bitmap if supported. */
domain_super_pgsize_bitmap(struct dmar_domain * domain)355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
356 {
357 unsigned long bitmap = 0;
358
359 /*
360 * 1-level super page supports page size of 2MiB, 2-level super page
361 * supports page size of both 2MiB and 1GiB.
362 */
363 if (domain->iommu_superpage == 1)
364 bitmap |= SZ_2M;
365 else if (domain->iommu_superpage == 2)
366 bitmap |= SZ_2M | SZ_1G;
367
368 return bitmap;
369 }
370
iommu_context_addr(struct intel_iommu * iommu,u8 bus,u8 devfn,int alloc)371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
372 u8 devfn, int alloc)
373 {
374 struct root_entry *root = &iommu->root_entry[bus];
375 struct context_entry *context;
376 u64 *entry;
377
378 /*
379 * Except that the caller requested to allocate a new entry,
380 * returning a copied context entry makes no sense.
381 */
382 if (!alloc && context_copied(iommu, bus, devfn))
383 return NULL;
384
385 entry = &root->lo;
386 if (sm_supported(iommu)) {
387 if (devfn >= 0x80) {
388 devfn -= 0x80;
389 entry = &root->hi;
390 }
391 devfn *= 2;
392 }
393 if (*entry & 1)
394 context = phys_to_virt(*entry & VTD_PAGE_MASK);
395 else {
396 unsigned long phy_addr;
397 if (!alloc)
398 return NULL;
399
400 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
401 if (!context)
402 return NULL;
403
404 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
405 phy_addr = virt_to_phys((void *)context);
406 *entry = phy_addr | 1;
407 __iommu_flush_cache(iommu, entry, sizeof(*entry));
408 }
409 return &context[devfn];
410 }
411
412 /**
413 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
414 * sub-hierarchy of a candidate PCI-PCI bridge
415 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
416 * @bridge: the candidate PCI-PCI bridge
417 *
418 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
419 */
420 static bool
is_downstream_to_pci_bridge(struct device * dev,struct device * bridge)421 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
422 {
423 struct pci_dev *pdev, *pbridge;
424
425 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
426 return false;
427
428 pdev = to_pci_dev(dev);
429 pbridge = to_pci_dev(bridge);
430
431 if (pbridge->subordinate &&
432 pbridge->subordinate->number <= pdev->bus->number &&
433 pbridge->subordinate->busn_res.end >= pdev->bus->number)
434 return true;
435
436 return false;
437 }
438
quirk_ioat_snb_local_iommu(struct pci_dev * pdev)439 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
440 {
441 struct dmar_drhd_unit *drhd;
442 u32 vtbar;
443 int rc;
444
445 /* We know that this device on this chipset has its own IOMMU.
446 * If we find it under a different IOMMU, then the BIOS is lying
447 * to us. Hope that the IOMMU for this device is actually
448 * disabled, and it needs no translation...
449 */
450 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
451 if (rc) {
452 /* "can't" happen */
453 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
454 return false;
455 }
456 vtbar &= 0xffff0000;
457
458 /* we know that the this iommu should be at offset 0xa000 from vtbar */
459 drhd = dmar_find_matched_drhd_unit(pdev);
460 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
461 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
462 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
463 return true;
464 }
465
466 return false;
467 }
468
iommu_is_dummy(struct intel_iommu * iommu,struct device * dev)469 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
470 {
471 if (!iommu || iommu->drhd->ignored)
472 return true;
473
474 if (dev_is_pci(dev)) {
475 struct pci_dev *pdev = to_pci_dev(dev);
476
477 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
478 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
479 quirk_ioat_snb_local_iommu(pdev))
480 return true;
481 }
482
483 return false;
484 }
485
device_lookup_iommu(struct device * dev,u8 * bus,u8 * devfn)486 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
487 {
488 struct dmar_drhd_unit *drhd = NULL;
489 struct pci_dev *pdev = NULL;
490 struct intel_iommu *iommu;
491 struct device *tmp;
492 u16 segment = 0;
493 int i;
494
495 if (!dev)
496 return NULL;
497
498 if (dev_is_pci(dev)) {
499 struct pci_dev *pf_pdev;
500
501 pdev = pci_real_dma_dev(to_pci_dev(dev));
502
503 /* VFs aren't listed in scope tables; we need to look up
504 * the PF instead to find the IOMMU. */
505 pf_pdev = pci_physfn(pdev);
506 dev = &pf_pdev->dev;
507 segment = pci_domain_nr(pdev->bus);
508 } else if (has_acpi_companion(dev))
509 dev = &ACPI_COMPANION(dev)->dev;
510
511 rcu_read_lock();
512 for_each_iommu(iommu, drhd) {
513 if (pdev && segment != drhd->segment)
514 continue;
515
516 for_each_active_dev_scope(drhd->devices,
517 drhd->devices_cnt, i, tmp) {
518 if (tmp == dev) {
519 /* For a VF use its original BDF# not that of the PF
520 * which we used for the IOMMU lookup. Strictly speaking
521 * we could do this for all PCI devices; we only need to
522 * get the BDF# from the scope table for ACPI matches. */
523 if (pdev && pdev->is_virtfn)
524 goto got_pdev;
525
526 if (bus && devfn) {
527 *bus = drhd->devices[i].bus;
528 *devfn = drhd->devices[i].devfn;
529 }
530 goto out;
531 }
532
533 if (is_downstream_to_pci_bridge(dev, tmp))
534 goto got_pdev;
535 }
536
537 if (pdev && drhd->include_all) {
538 got_pdev:
539 if (bus && devfn) {
540 *bus = pdev->bus->number;
541 *devfn = pdev->devfn;
542 }
543 goto out;
544 }
545 }
546 iommu = NULL;
547 out:
548 if (iommu_is_dummy(iommu, dev))
549 iommu = NULL;
550
551 rcu_read_unlock();
552
553 return iommu;
554 }
555
domain_flush_cache(struct dmar_domain * domain,void * addr,int size)556 static void domain_flush_cache(struct dmar_domain *domain,
557 void *addr, int size)
558 {
559 if (!domain->iommu_coherency)
560 clflush_cache_range(addr, size);
561 }
562
free_context_table(struct intel_iommu * iommu)563 static void free_context_table(struct intel_iommu *iommu)
564 {
565 struct context_entry *context;
566 int i;
567
568 if (!iommu->root_entry)
569 return;
570
571 for (i = 0; i < ROOT_ENTRY_NR; i++) {
572 context = iommu_context_addr(iommu, i, 0, 0);
573 if (context)
574 iommu_free_page(context);
575
576 if (!sm_supported(iommu))
577 continue;
578
579 context = iommu_context_addr(iommu, i, 0x80, 0);
580 if (context)
581 iommu_free_page(context);
582 }
583
584 iommu_free_page(iommu->root_entry);
585 iommu->root_entry = NULL;
586 }
587
588 #ifdef CONFIG_DMAR_DEBUG
pgtable_walk(struct intel_iommu * iommu,unsigned long pfn,u8 bus,u8 devfn,struct dma_pte * parent,int level)589 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
590 u8 bus, u8 devfn, struct dma_pte *parent, int level)
591 {
592 struct dma_pte *pte;
593 int offset;
594
595 while (1) {
596 offset = pfn_level_offset(pfn, level);
597 pte = &parent[offset];
598
599 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
600
601 if (!dma_pte_present(pte)) {
602 pr_info("page table not present at level %d\n", level - 1);
603 break;
604 }
605
606 if (level == 1 || dma_pte_superpage(pte))
607 break;
608
609 parent = phys_to_virt(dma_pte_addr(pte));
610 level--;
611 }
612 }
613
dmar_fault_dump_ptes(struct intel_iommu * iommu,u16 source_id,unsigned long long addr,u32 pasid)614 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
615 unsigned long long addr, u32 pasid)
616 {
617 struct pasid_dir_entry *dir, *pde;
618 struct pasid_entry *entries, *pte;
619 struct context_entry *ctx_entry;
620 struct root_entry *rt_entry;
621 int i, dir_index, index, level;
622 u8 devfn = source_id & 0xff;
623 u8 bus = source_id >> 8;
624 struct dma_pte *pgtable;
625
626 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
627
628 /* root entry dump */
629 if (!iommu->root_entry) {
630 pr_info("root table is not present\n");
631 return;
632 }
633 rt_entry = &iommu->root_entry[bus];
634
635 if (sm_supported(iommu))
636 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
637 rt_entry->hi, rt_entry->lo);
638 else
639 pr_info("root entry: 0x%016llx", rt_entry->lo);
640
641 /* context entry dump */
642 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
643 if (!ctx_entry) {
644 pr_info("context table is not present\n");
645 return;
646 }
647
648 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
649 ctx_entry->hi, ctx_entry->lo);
650
651 /* legacy mode does not require PASID entries */
652 if (!sm_supported(iommu)) {
653 if (!context_present(ctx_entry)) {
654 pr_info("legacy mode page table is not present\n");
655 return;
656 }
657 level = agaw_to_level(ctx_entry->hi & 7);
658 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
659 goto pgtable_walk;
660 }
661
662 if (!context_present(ctx_entry)) {
663 pr_info("pasid directory table is not present\n");
664 return;
665 }
666
667 /* get the pointer to pasid directory entry */
668 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
669
670 /* For request-without-pasid, get the pasid from context entry */
671 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
672 pasid = IOMMU_NO_PASID;
673
674 dir_index = pasid >> PASID_PDE_SHIFT;
675 pde = &dir[dir_index];
676 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
677
678 /* get the pointer to the pasid table entry */
679 entries = get_pasid_table_from_pde(pde);
680 if (!entries) {
681 pr_info("pasid table is not present\n");
682 return;
683 }
684 index = pasid & PASID_PTE_MASK;
685 pte = &entries[index];
686 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
687 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
688
689 if (!pasid_pte_is_present(pte)) {
690 pr_info("scalable mode page table is not present\n");
691 return;
692 }
693
694 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
695 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
696 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
697 } else {
698 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
699 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
700 }
701
702 pgtable_walk:
703 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
704 }
705 #endif
706
pfn_to_dma_pte(struct dmar_domain * domain,unsigned long pfn,int * target_level,gfp_t gfp)707 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
708 unsigned long pfn, int *target_level,
709 gfp_t gfp)
710 {
711 struct dma_pte *parent, *pte;
712 int level = agaw_to_level(domain->agaw);
713 int offset;
714
715 if (!domain_pfn_supported(domain, pfn))
716 /* Address beyond IOMMU's addressing capabilities. */
717 return NULL;
718
719 parent = domain->pgd;
720
721 while (1) {
722 void *tmp_page;
723
724 offset = pfn_level_offset(pfn, level);
725 pte = &parent[offset];
726 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
727 break;
728 if (level == *target_level)
729 break;
730
731 if (!dma_pte_present(pte)) {
732 uint64_t pteval, tmp;
733
734 tmp_page = iommu_alloc_page_node(domain->nid, gfp);
735
736 if (!tmp_page)
737 return NULL;
738
739 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
740 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
741 if (domain->use_first_level)
742 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
743
744 tmp = 0ULL;
745 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
746 /* Someone else set it while we were thinking; use theirs. */
747 iommu_free_page(tmp_page);
748 else
749 domain_flush_cache(domain, pte, sizeof(*pte));
750 }
751 if (level == 1)
752 break;
753
754 parent = phys_to_virt(dma_pte_addr(pte));
755 level--;
756 }
757
758 if (!*target_level)
759 *target_level = level;
760
761 return pte;
762 }
763
764 /* return address's pte at specific level */
dma_pfn_level_pte(struct dmar_domain * domain,unsigned long pfn,int level,int * large_page)765 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
766 unsigned long pfn,
767 int level, int *large_page)
768 {
769 struct dma_pte *parent, *pte;
770 int total = agaw_to_level(domain->agaw);
771 int offset;
772
773 parent = domain->pgd;
774 while (level <= total) {
775 offset = pfn_level_offset(pfn, total);
776 pte = &parent[offset];
777 if (level == total)
778 return pte;
779
780 if (!dma_pte_present(pte)) {
781 *large_page = total;
782 break;
783 }
784
785 if (dma_pte_superpage(pte)) {
786 *large_page = total;
787 return pte;
788 }
789
790 parent = phys_to_virt(dma_pte_addr(pte));
791 total--;
792 }
793 return NULL;
794 }
795
796 /* clear last level pte, a tlb flush should be followed */
dma_pte_clear_range(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn)797 static void dma_pte_clear_range(struct dmar_domain *domain,
798 unsigned long start_pfn,
799 unsigned long last_pfn)
800 {
801 unsigned int large_page;
802 struct dma_pte *first_pte, *pte;
803
804 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
805 WARN_ON(start_pfn > last_pfn))
806 return;
807
808 /* we don't need lock here; nobody else touches the iova range */
809 do {
810 large_page = 1;
811 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
812 if (!pte) {
813 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
814 continue;
815 }
816 do {
817 dma_clear_pte(pte);
818 start_pfn += lvl_to_nr_pages(large_page);
819 pte++;
820 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
821
822 domain_flush_cache(domain, first_pte,
823 (void *)pte - (void *)first_pte);
824
825 } while (start_pfn && start_pfn <= last_pfn);
826 }
827
dma_pte_free_level(struct dmar_domain * domain,int level,int retain_level,struct dma_pte * pte,unsigned long pfn,unsigned long start_pfn,unsigned long last_pfn)828 static void dma_pte_free_level(struct dmar_domain *domain, int level,
829 int retain_level, struct dma_pte *pte,
830 unsigned long pfn, unsigned long start_pfn,
831 unsigned long last_pfn)
832 {
833 pfn = max(start_pfn, pfn);
834 pte = &pte[pfn_level_offset(pfn, level)];
835
836 do {
837 unsigned long level_pfn;
838 struct dma_pte *level_pte;
839
840 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
841 goto next;
842
843 level_pfn = pfn & level_mask(level);
844 level_pte = phys_to_virt(dma_pte_addr(pte));
845
846 if (level > 2) {
847 dma_pte_free_level(domain, level - 1, retain_level,
848 level_pte, level_pfn, start_pfn,
849 last_pfn);
850 }
851
852 /*
853 * Free the page table if we're below the level we want to
854 * retain and the range covers the entire table.
855 */
856 if (level < retain_level && !(start_pfn > level_pfn ||
857 last_pfn < level_pfn + level_size(level) - 1)) {
858 dma_clear_pte(pte);
859 domain_flush_cache(domain, pte, sizeof(*pte));
860 iommu_free_page(level_pte);
861 }
862 next:
863 pfn += level_size(level);
864 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
865 }
866
867 /*
868 * clear last level (leaf) ptes and free page table pages below the
869 * level we wish to keep intact.
870 */
dma_pte_free_pagetable(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn,int retain_level)871 static void dma_pte_free_pagetable(struct dmar_domain *domain,
872 unsigned long start_pfn,
873 unsigned long last_pfn,
874 int retain_level)
875 {
876 dma_pte_clear_range(domain, start_pfn, last_pfn);
877
878 /* We don't need lock here; nobody else touches the iova range */
879 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
880 domain->pgd, 0, start_pfn, last_pfn);
881
882 /* free pgd */
883 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
884 iommu_free_page(domain->pgd);
885 domain->pgd = NULL;
886 }
887 }
888
889 /* When a page at a given level is being unlinked from its parent, we don't
890 need to *modify* it at all. All we need to do is make a list of all the
891 pages which can be freed just as soon as we've flushed the IOTLB and we
892 know the hardware page-walk will no longer touch them.
893 The 'pte' argument is the *parent* PTE, pointing to the page that is to
894 be freed. */
dma_pte_list_pagetables(struct dmar_domain * domain,int level,struct dma_pte * pte,struct list_head * freelist)895 static void dma_pte_list_pagetables(struct dmar_domain *domain,
896 int level, struct dma_pte *pte,
897 struct list_head *freelist)
898 {
899 struct page *pg;
900
901 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
902 list_add_tail(&pg->lru, freelist);
903
904 if (level == 1)
905 return;
906
907 pte = page_address(pg);
908 do {
909 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
910 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
911 pte++;
912 } while (!first_pte_in_page(pte));
913 }
914
dma_pte_clear_level(struct dmar_domain * domain,int level,struct dma_pte * pte,unsigned long pfn,unsigned long start_pfn,unsigned long last_pfn,struct list_head * freelist)915 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
916 struct dma_pte *pte, unsigned long pfn,
917 unsigned long start_pfn, unsigned long last_pfn,
918 struct list_head *freelist)
919 {
920 struct dma_pte *first_pte = NULL, *last_pte = NULL;
921
922 pfn = max(start_pfn, pfn);
923 pte = &pte[pfn_level_offset(pfn, level)];
924
925 do {
926 unsigned long level_pfn = pfn & level_mask(level);
927
928 if (!dma_pte_present(pte))
929 goto next;
930
931 /* If range covers entire pagetable, free it */
932 if (start_pfn <= level_pfn &&
933 last_pfn >= level_pfn + level_size(level) - 1) {
934 /* These suborbinate page tables are going away entirely. Don't
935 bother to clear them; we're just going to *free* them. */
936 if (level > 1 && !dma_pte_superpage(pte))
937 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
938
939 dma_clear_pte(pte);
940 if (!first_pte)
941 first_pte = pte;
942 last_pte = pte;
943 } else if (level > 1) {
944 /* Recurse down into a level that isn't *entirely* obsolete */
945 dma_pte_clear_level(domain, level - 1,
946 phys_to_virt(dma_pte_addr(pte)),
947 level_pfn, start_pfn, last_pfn,
948 freelist);
949 }
950 next:
951 pfn = level_pfn + level_size(level);
952 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
953
954 if (first_pte)
955 domain_flush_cache(domain, first_pte,
956 (void *)++last_pte - (void *)first_pte);
957 }
958
959 /* We can't just free the pages because the IOMMU may still be walking
960 the page tables, and may have cached the intermediate levels. The
961 pages can only be freed after the IOTLB flush has been done. */
domain_unmap(struct dmar_domain * domain,unsigned long start_pfn,unsigned long last_pfn,struct list_head * freelist)962 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
963 unsigned long last_pfn, struct list_head *freelist)
964 {
965 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
966 WARN_ON(start_pfn > last_pfn))
967 return;
968
969 /* we don't need lock here; nobody else touches the iova range */
970 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
971 domain->pgd, 0, start_pfn, last_pfn, freelist);
972
973 /* free pgd */
974 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
975 struct page *pgd_page = virt_to_page(domain->pgd);
976 list_add_tail(&pgd_page->lru, freelist);
977 domain->pgd = NULL;
978 }
979 }
980
981 /* iommu handling */
iommu_alloc_root_entry(struct intel_iommu * iommu)982 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
983 {
984 struct root_entry *root;
985
986 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
987 if (!root) {
988 pr_err("Allocating root entry for %s failed\n",
989 iommu->name);
990 return -ENOMEM;
991 }
992
993 __iommu_flush_cache(iommu, root, ROOT_SIZE);
994 iommu->root_entry = root;
995
996 return 0;
997 }
998
iommu_set_root_entry(struct intel_iommu * iommu)999 static void iommu_set_root_entry(struct intel_iommu *iommu)
1000 {
1001 u64 addr;
1002 u32 sts;
1003 unsigned long flag;
1004
1005 addr = virt_to_phys(iommu->root_entry);
1006 if (sm_supported(iommu))
1007 addr |= DMA_RTADDR_SMT;
1008
1009 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1010 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1011
1012 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1013
1014 /* Make sure hardware complete it */
1015 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1016 readl, (sts & DMA_GSTS_RTPS), sts);
1017
1018 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1019
1020 /*
1021 * Hardware invalidates all DMA remapping hardware translation
1022 * caches as part of SRTP flow.
1023 */
1024 if (cap_esrtps(iommu->cap))
1025 return;
1026
1027 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1028 if (sm_supported(iommu))
1029 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1030 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1031 }
1032
iommu_flush_write_buffer(struct intel_iommu * iommu)1033 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1034 {
1035 u32 val;
1036 unsigned long flag;
1037
1038 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1039 return;
1040
1041 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1042 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1043
1044 /* Make sure hardware complete it */
1045 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1046 readl, (!(val & DMA_GSTS_WBFS)), val);
1047
1048 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1049 }
1050
1051 /* return value determine if we need a write buffer flush */
__iommu_flush_context(struct intel_iommu * iommu,u16 did,u16 source_id,u8 function_mask,u64 type)1052 static void __iommu_flush_context(struct intel_iommu *iommu,
1053 u16 did, u16 source_id, u8 function_mask,
1054 u64 type)
1055 {
1056 u64 val = 0;
1057 unsigned long flag;
1058
1059 switch (type) {
1060 case DMA_CCMD_GLOBAL_INVL:
1061 val = DMA_CCMD_GLOBAL_INVL;
1062 break;
1063 case DMA_CCMD_DOMAIN_INVL:
1064 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1065 break;
1066 case DMA_CCMD_DEVICE_INVL:
1067 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1068 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1069 break;
1070 default:
1071 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1072 iommu->name, type);
1073 return;
1074 }
1075 val |= DMA_CCMD_ICC;
1076
1077 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1078 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1079
1080 /* Make sure hardware complete it */
1081 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1082 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1083
1084 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085 }
1086
__iommu_flush_iotlb(struct intel_iommu * iommu,u16 did,u64 addr,unsigned int size_order,u64 type)1087 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1088 unsigned int size_order, u64 type)
1089 {
1090 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1091 u64 val = 0, val_iva = 0;
1092 unsigned long flag;
1093
1094 switch (type) {
1095 case DMA_TLB_GLOBAL_FLUSH:
1096 /* global flush doesn't need set IVA_REG */
1097 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1098 break;
1099 case DMA_TLB_DSI_FLUSH:
1100 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1101 break;
1102 case DMA_TLB_PSI_FLUSH:
1103 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1104 /* IH bit is passed in as part of address */
1105 val_iva = size_order | addr;
1106 break;
1107 default:
1108 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1109 iommu->name, type);
1110 return;
1111 }
1112
1113 if (cap_write_drain(iommu->cap))
1114 val |= DMA_TLB_WRITE_DRAIN;
1115
1116 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1117 /* Note: Only uses first TLB reg currently */
1118 if (val_iva)
1119 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1120 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1121
1122 /* Make sure hardware complete it */
1123 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1124 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1125
1126 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1127
1128 /* check IOTLB invalidation granularity */
1129 if (DMA_TLB_IAIG(val) == 0)
1130 pr_err("Flush IOTLB failed\n");
1131 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1132 pr_debug("TLB flush request %Lx, actual %Lx\n",
1133 (unsigned long long)DMA_TLB_IIRG(type),
1134 (unsigned long long)DMA_TLB_IAIG(val));
1135 }
1136
1137 static struct device_domain_info *
domain_lookup_dev_info(struct dmar_domain * domain,struct intel_iommu * iommu,u8 bus,u8 devfn)1138 domain_lookup_dev_info(struct dmar_domain *domain,
1139 struct intel_iommu *iommu, u8 bus, u8 devfn)
1140 {
1141 struct device_domain_info *info;
1142 unsigned long flags;
1143
1144 spin_lock_irqsave(&domain->lock, flags);
1145 list_for_each_entry(info, &domain->devices, link) {
1146 if (info->iommu == iommu && info->bus == bus &&
1147 info->devfn == devfn) {
1148 spin_unlock_irqrestore(&domain->lock, flags);
1149 return info;
1150 }
1151 }
1152 spin_unlock_irqrestore(&domain->lock, flags);
1153
1154 return NULL;
1155 }
1156
1157 /*
1158 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1159 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1160 * check because it applies only to the built-in QAT devices and it doesn't
1161 * grant additional privileges.
1162 */
1163 #define BUGGY_QAT_DEVID_MASK 0x4940
dev_needs_extra_dtlb_flush(struct pci_dev * pdev)1164 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1165 {
1166 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1167 return false;
1168
1169 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1170 return false;
1171
1172 return true;
1173 }
1174
iommu_enable_pci_caps(struct device_domain_info * info)1175 static void iommu_enable_pci_caps(struct device_domain_info *info)
1176 {
1177 struct pci_dev *pdev;
1178
1179 if (!dev_is_pci(info->dev))
1180 return;
1181
1182 pdev = to_pci_dev(info->dev);
1183 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1184 !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1185 info->ats_enabled = 1;
1186 }
1187
iommu_disable_pci_caps(struct device_domain_info * info)1188 static void iommu_disable_pci_caps(struct device_domain_info *info)
1189 {
1190 struct pci_dev *pdev;
1191
1192 if (!dev_is_pci(info->dev))
1193 return;
1194
1195 pdev = to_pci_dev(info->dev);
1196
1197 if (info->ats_enabled) {
1198 pci_disable_ats(pdev);
1199 info->ats_enabled = 0;
1200 }
1201 }
1202
intel_flush_iotlb_all(struct iommu_domain * domain)1203 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1204 {
1205 cache_tag_flush_all(to_dmar_domain(domain));
1206 }
1207
iommu_disable_protect_mem_regions(struct intel_iommu * iommu)1208 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1209 {
1210 u32 pmen;
1211 unsigned long flags;
1212
1213 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1214 return;
1215
1216 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1217 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1218 pmen &= ~DMA_PMEN_EPM;
1219 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1220
1221 /* wait for the protected region status bit to clear */
1222 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1223 readl, !(pmen & DMA_PMEN_PRS), pmen);
1224
1225 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226 }
1227
iommu_enable_translation(struct intel_iommu * iommu)1228 static void iommu_enable_translation(struct intel_iommu *iommu)
1229 {
1230 u32 sts;
1231 unsigned long flags;
1232
1233 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1234 iommu->gcmd |= DMA_GCMD_TE;
1235 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1236
1237 /* Make sure hardware complete it */
1238 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1239 readl, (sts & DMA_GSTS_TES), sts);
1240
1241 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1242 }
1243
iommu_disable_translation(struct intel_iommu * iommu)1244 static void iommu_disable_translation(struct intel_iommu *iommu)
1245 {
1246 u32 sts;
1247 unsigned long flag;
1248
1249 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1250 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1251 return;
1252
1253 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254 iommu->gcmd &= ~DMA_GCMD_TE;
1255 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1256
1257 /* Make sure hardware complete it */
1258 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259 readl, (!(sts & DMA_GSTS_TES)), sts);
1260
1261 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1262 }
1263
iommu_init_domains(struct intel_iommu * iommu)1264 static int iommu_init_domains(struct intel_iommu *iommu)
1265 {
1266 u32 ndomains;
1267
1268 ndomains = cap_ndoms(iommu->cap);
1269 pr_debug("%s: Number of Domains supported <%d>\n",
1270 iommu->name, ndomains);
1271
1272 spin_lock_init(&iommu->lock);
1273
1274 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1275 if (!iommu->domain_ids)
1276 return -ENOMEM;
1277
1278 /*
1279 * If Caching mode is set, then invalid translations are tagged
1280 * with domain-id 0, hence we need to pre-allocate it. We also
1281 * use domain-id 0 as a marker for non-allocated domain-id, so
1282 * make sure it is not used for a real domain.
1283 */
1284 set_bit(0, iommu->domain_ids);
1285
1286 /*
1287 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1288 * entry for first-level or pass-through translation modes should
1289 * be programmed with a domain id different from those used for
1290 * second-level or nested translation. We reserve a domain id for
1291 * this purpose. This domain id is also used for identity domain
1292 * in legacy mode.
1293 */
1294 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1295
1296 return 0;
1297 }
1298
disable_dmar_iommu(struct intel_iommu * iommu)1299 static void disable_dmar_iommu(struct intel_iommu *iommu)
1300 {
1301 if (!iommu->domain_ids)
1302 return;
1303
1304 /*
1305 * All iommu domains must have been detached from the devices,
1306 * hence there should be no domain IDs in use.
1307 */
1308 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1309 > NUM_RESERVED_DID))
1310 return;
1311
1312 if (iommu->gcmd & DMA_GCMD_TE)
1313 iommu_disable_translation(iommu);
1314 }
1315
free_dmar_iommu(struct intel_iommu * iommu)1316 static void free_dmar_iommu(struct intel_iommu *iommu)
1317 {
1318 if (iommu->domain_ids) {
1319 bitmap_free(iommu->domain_ids);
1320 iommu->domain_ids = NULL;
1321 }
1322
1323 if (iommu->copied_tables) {
1324 bitmap_free(iommu->copied_tables);
1325 iommu->copied_tables = NULL;
1326 }
1327
1328 /* free context mapping */
1329 free_context_table(iommu);
1330
1331 if (ecap_prs(iommu->ecap))
1332 intel_iommu_finish_prq(iommu);
1333 }
1334
1335 /*
1336 * Check and return whether first level is used by default for
1337 * DMA translation.
1338 */
first_level_by_default(struct intel_iommu * iommu)1339 static bool first_level_by_default(struct intel_iommu *iommu)
1340 {
1341 /* Only SL is available in legacy mode */
1342 if (!sm_supported(iommu))
1343 return false;
1344
1345 /* Only level (either FL or SL) is available, just use it */
1346 if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1347 return ecap_flts(iommu->ecap);
1348
1349 return true;
1350 }
1351
domain_attach_iommu(struct dmar_domain * domain,struct intel_iommu * iommu)1352 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1353 {
1354 struct iommu_domain_info *info, *curr;
1355 unsigned long ndomains;
1356 int num, ret = -ENOSPC;
1357
1358 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1359 return 0;
1360
1361 info = kzalloc(sizeof(*info), GFP_KERNEL);
1362 if (!info)
1363 return -ENOMEM;
1364
1365 spin_lock(&iommu->lock);
1366 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1367 if (curr) {
1368 curr->refcnt++;
1369 spin_unlock(&iommu->lock);
1370 kfree(info);
1371 return 0;
1372 }
1373
1374 ndomains = cap_ndoms(iommu->cap);
1375 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1376 if (num >= ndomains) {
1377 pr_err("%s: No free domain ids\n", iommu->name);
1378 goto err_unlock;
1379 }
1380
1381 set_bit(num, iommu->domain_ids);
1382 info->refcnt = 1;
1383 info->did = num;
1384 info->iommu = iommu;
1385 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1386 NULL, info, GFP_ATOMIC);
1387 if (curr) {
1388 ret = xa_err(curr) ? : -EBUSY;
1389 goto err_clear;
1390 }
1391
1392 spin_unlock(&iommu->lock);
1393 return 0;
1394
1395 err_clear:
1396 clear_bit(info->did, iommu->domain_ids);
1397 err_unlock:
1398 spin_unlock(&iommu->lock);
1399 kfree(info);
1400 return ret;
1401 }
1402
domain_detach_iommu(struct dmar_domain * domain,struct intel_iommu * iommu)1403 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1404 {
1405 struct iommu_domain_info *info;
1406
1407 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1408 return;
1409
1410 spin_lock(&iommu->lock);
1411 info = xa_load(&domain->iommu_array, iommu->seq_id);
1412 if (--info->refcnt == 0) {
1413 clear_bit(info->did, iommu->domain_ids);
1414 xa_erase(&domain->iommu_array, iommu->seq_id);
1415 domain->nid = NUMA_NO_NODE;
1416 kfree(info);
1417 }
1418 spin_unlock(&iommu->lock);
1419 }
1420
domain_exit(struct dmar_domain * domain)1421 static void domain_exit(struct dmar_domain *domain)
1422 {
1423 if (domain->pgd) {
1424 LIST_HEAD(freelist);
1425
1426 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1427 iommu_put_pages_list(&freelist);
1428 }
1429
1430 if (WARN_ON(!list_empty(&domain->devices)))
1431 return;
1432
1433 kfree(domain->qi_batch);
1434 kfree(domain);
1435 }
1436
1437 /*
1438 * For kdump cases, old valid entries may be cached due to the
1439 * in-flight DMA and copied pgtable, but there is no unmapping
1440 * behaviour for them, thus we need an explicit cache flush for
1441 * the newly-mapped device. For kdump, at this point, the device
1442 * is supposed to finish reset at its driver probe stage, so no
1443 * in-flight DMA will exist, and we don't need to worry anymore
1444 * hereafter.
1445 */
copied_context_tear_down(struct intel_iommu * iommu,struct context_entry * context,u8 bus,u8 devfn)1446 static void copied_context_tear_down(struct intel_iommu *iommu,
1447 struct context_entry *context,
1448 u8 bus, u8 devfn)
1449 {
1450 u16 did_old;
1451
1452 if (!context_copied(iommu, bus, devfn))
1453 return;
1454
1455 assert_spin_locked(&iommu->lock);
1456
1457 did_old = context_domain_id(context);
1458 context_clear_entry(context);
1459
1460 if (did_old < cap_ndoms(iommu->cap)) {
1461 iommu->flush.flush_context(iommu, did_old,
1462 PCI_DEVID(bus, devfn),
1463 DMA_CCMD_MASK_NOBIT,
1464 DMA_CCMD_DEVICE_INVL);
1465 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1466 DMA_TLB_DSI_FLUSH);
1467 }
1468
1469 clear_context_copied(iommu, bus, devfn);
1470 }
1471
1472 /*
1473 * It's a non-present to present mapping. If hardware doesn't cache
1474 * non-present entry we only need to flush the write-buffer. If the
1475 * _does_ cache non-present entries, then it does so in the special
1476 * domain #0, which we have to flush:
1477 */
context_present_cache_flush(struct intel_iommu * iommu,u16 did,u8 bus,u8 devfn)1478 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1479 u8 bus, u8 devfn)
1480 {
1481 if (cap_caching_mode(iommu->cap)) {
1482 iommu->flush.flush_context(iommu, 0,
1483 PCI_DEVID(bus, devfn),
1484 DMA_CCMD_MASK_NOBIT,
1485 DMA_CCMD_DEVICE_INVL);
1486 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1487 } else {
1488 iommu_flush_write_buffer(iommu);
1489 }
1490 }
1491
domain_context_mapping_one(struct dmar_domain * domain,struct intel_iommu * iommu,u8 bus,u8 devfn)1492 static int domain_context_mapping_one(struct dmar_domain *domain,
1493 struct intel_iommu *iommu,
1494 u8 bus, u8 devfn)
1495 {
1496 struct device_domain_info *info =
1497 domain_lookup_dev_info(domain, iommu, bus, devfn);
1498 u16 did = domain_id_iommu(domain, iommu);
1499 int translation = CONTEXT_TT_MULTI_LEVEL;
1500 struct dma_pte *pgd = domain->pgd;
1501 struct context_entry *context;
1502 int ret;
1503
1504 pr_debug("Set context mapping for %02x:%02x.%d\n",
1505 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1506
1507 spin_lock(&iommu->lock);
1508 ret = -ENOMEM;
1509 context = iommu_context_addr(iommu, bus, devfn, 1);
1510 if (!context)
1511 goto out_unlock;
1512
1513 ret = 0;
1514 if (context_present(context) && !context_copied(iommu, bus, devfn))
1515 goto out_unlock;
1516
1517 copied_context_tear_down(iommu, context, bus, devfn);
1518 context_clear_entry(context);
1519 context_set_domain_id(context, did);
1520
1521 if (info && info->ats_supported)
1522 translation = CONTEXT_TT_DEV_IOTLB;
1523 else
1524 translation = CONTEXT_TT_MULTI_LEVEL;
1525
1526 context_set_address_root(context, virt_to_phys(pgd));
1527 context_set_address_width(context, domain->agaw);
1528 context_set_translation_type(context, translation);
1529 context_set_fault_enable(context);
1530 context_set_present(context);
1531 if (!ecap_coherent(iommu->ecap))
1532 clflush_cache_range(context, sizeof(*context));
1533 context_present_cache_flush(iommu, did, bus, devfn);
1534 ret = 0;
1535
1536 out_unlock:
1537 spin_unlock(&iommu->lock);
1538
1539 return ret;
1540 }
1541
domain_context_mapping_cb(struct pci_dev * pdev,u16 alias,void * opaque)1542 static int domain_context_mapping_cb(struct pci_dev *pdev,
1543 u16 alias, void *opaque)
1544 {
1545 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1546 struct intel_iommu *iommu = info->iommu;
1547 struct dmar_domain *domain = opaque;
1548
1549 return domain_context_mapping_one(domain, iommu,
1550 PCI_BUS_NUM(alias), alias & 0xff);
1551 }
1552
1553 static int
domain_context_mapping(struct dmar_domain * domain,struct device * dev)1554 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1555 {
1556 struct device_domain_info *info = dev_iommu_priv_get(dev);
1557 struct intel_iommu *iommu = info->iommu;
1558 u8 bus = info->bus, devfn = info->devfn;
1559
1560 if (!dev_is_pci(dev))
1561 return domain_context_mapping_one(domain, iommu, bus, devfn);
1562
1563 return pci_for_each_dma_alias(to_pci_dev(dev),
1564 domain_context_mapping_cb, domain);
1565 }
1566
1567 /* Return largest possible superpage level for a given mapping */
hardware_largepage_caps(struct dmar_domain * domain,unsigned long iov_pfn,unsigned long phy_pfn,unsigned long pages)1568 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1569 unsigned long phy_pfn, unsigned long pages)
1570 {
1571 int support, level = 1;
1572 unsigned long pfnmerge;
1573
1574 support = domain->iommu_superpage;
1575
1576 /* To use a large page, the virtual *and* physical addresses
1577 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1578 of them will mean we have to use smaller pages. So just
1579 merge them and check both at once. */
1580 pfnmerge = iov_pfn | phy_pfn;
1581
1582 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1583 pages >>= VTD_STRIDE_SHIFT;
1584 if (!pages)
1585 break;
1586 pfnmerge >>= VTD_STRIDE_SHIFT;
1587 level++;
1588 support--;
1589 }
1590 return level;
1591 }
1592
1593 /*
1594 * Ensure that old small page tables are removed to make room for superpage(s).
1595 * We're going to add new large pages, so make sure we don't remove their parent
1596 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1597 */
switch_to_super_page(struct dmar_domain * domain,unsigned long start_pfn,unsigned long end_pfn,int level)1598 static void switch_to_super_page(struct dmar_domain *domain,
1599 unsigned long start_pfn,
1600 unsigned long end_pfn, int level)
1601 {
1602 unsigned long lvl_pages = lvl_to_nr_pages(level);
1603 struct dma_pte *pte = NULL;
1604
1605 while (start_pfn <= end_pfn) {
1606 if (!pte)
1607 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1608 GFP_ATOMIC);
1609
1610 if (dma_pte_present(pte)) {
1611 dma_pte_free_pagetable(domain, start_pfn,
1612 start_pfn + lvl_pages - 1,
1613 level + 1);
1614
1615 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1616 end_pfn << VTD_PAGE_SHIFT, 0);
1617 }
1618
1619 pte++;
1620 start_pfn += lvl_pages;
1621 if (first_pte_in_page(pte))
1622 pte = NULL;
1623 }
1624 }
1625
1626 static int
__domain_mapping(struct dmar_domain * domain,unsigned long iov_pfn,unsigned long phys_pfn,unsigned long nr_pages,int prot,gfp_t gfp)1627 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1628 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1629 gfp_t gfp)
1630 {
1631 struct dma_pte *first_pte = NULL, *pte = NULL;
1632 unsigned int largepage_lvl = 0;
1633 unsigned long lvl_pages = 0;
1634 phys_addr_t pteval;
1635 u64 attr;
1636
1637 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1638 return -EINVAL;
1639
1640 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1641 return -EINVAL;
1642
1643 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1644 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1645 return -EINVAL;
1646 }
1647
1648 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1649 attr |= DMA_FL_PTE_PRESENT;
1650 if (domain->use_first_level) {
1651 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1652 if (prot & DMA_PTE_WRITE)
1653 attr |= DMA_FL_PTE_DIRTY;
1654 }
1655
1656 domain->has_mappings = true;
1657
1658 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1659
1660 while (nr_pages > 0) {
1661 uint64_t tmp;
1662
1663 if (!pte) {
1664 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1665 phys_pfn, nr_pages);
1666
1667 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1668 gfp);
1669 if (!pte)
1670 return -ENOMEM;
1671 first_pte = pte;
1672
1673 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1674
1675 /* It is large page*/
1676 if (largepage_lvl > 1) {
1677 unsigned long end_pfn;
1678 unsigned long pages_to_remove;
1679
1680 pteval |= DMA_PTE_LARGE_PAGE;
1681 pages_to_remove = min_t(unsigned long, nr_pages,
1682 nr_pte_to_next_page(pte) * lvl_pages);
1683 end_pfn = iov_pfn + pages_to_remove - 1;
1684 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1685 } else {
1686 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1687 }
1688
1689 }
1690 /* We don't need lock here, nobody else
1691 * touches the iova range
1692 */
1693 tmp = 0ULL;
1694 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1695 static int dumps = 5;
1696 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1697 iov_pfn, tmp, (unsigned long long)pteval);
1698 if (dumps) {
1699 dumps--;
1700 debug_dma_dump_mappings(NULL);
1701 }
1702 WARN_ON(1);
1703 }
1704
1705 nr_pages -= lvl_pages;
1706 iov_pfn += lvl_pages;
1707 phys_pfn += lvl_pages;
1708 pteval += lvl_pages * VTD_PAGE_SIZE;
1709
1710 /* If the next PTE would be the first in a new page, then we
1711 * need to flush the cache on the entries we've just written.
1712 * And then we'll need to recalculate 'pte', so clear it and
1713 * let it get set again in the if (!pte) block above.
1714 *
1715 * If we're done (!nr_pages) we need to flush the cache too.
1716 *
1717 * Also if we've been setting superpages, we may need to
1718 * recalculate 'pte' and switch back to smaller pages for the
1719 * end of the mapping, if the trailing size is not enough to
1720 * use another superpage (i.e. nr_pages < lvl_pages).
1721 */
1722 pte++;
1723 if (!nr_pages || first_pte_in_page(pte) ||
1724 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1725 domain_flush_cache(domain, first_pte,
1726 (void *)pte - (void *)first_pte);
1727 pte = NULL;
1728 }
1729 }
1730
1731 return 0;
1732 }
1733
domain_context_clear_one(struct device_domain_info * info,u8 bus,u8 devfn)1734 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1735 {
1736 struct intel_iommu *iommu = info->iommu;
1737 struct context_entry *context;
1738 u16 did;
1739
1740 spin_lock(&iommu->lock);
1741 context = iommu_context_addr(iommu, bus, devfn, 0);
1742 if (!context) {
1743 spin_unlock(&iommu->lock);
1744 return;
1745 }
1746
1747 did = context_domain_id(context);
1748 context_clear_entry(context);
1749 __iommu_flush_cache(iommu, context, sizeof(*context));
1750 spin_unlock(&iommu->lock);
1751 intel_context_flush_present(info, context, did, true);
1752 }
1753
__domain_setup_first_level(struct intel_iommu * iommu,struct device * dev,ioasid_t pasid,u16 did,pgd_t * pgd,int flags,struct iommu_domain * old)1754 int __domain_setup_first_level(struct intel_iommu *iommu,
1755 struct device *dev, ioasid_t pasid,
1756 u16 did, pgd_t *pgd, int flags,
1757 struct iommu_domain *old)
1758 {
1759 if (!old)
1760 return intel_pasid_setup_first_level(iommu, dev, pgd,
1761 pasid, did, flags);
1762 return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1763 iommu_domain_did(old, iommu),
1764 flags);
1765 }
1766
domain_setup_second_level(struct intel_iommu * iommu,struct dmar_domain * domain,struct device * dev,ioasid_t pasid,struct iommu_domain * old)1767 static int domain_setup_second_level(struct intel_iommu *iommu,
1768 struct dmar_domain *domain,
1769 struct device *dev, ioasid_t pasid,
1770 struct iommu_domain *old)
1771 {
1772 if (!old)
1773 return intel_pasid_setup_second_level(iommu, domain,
1774 dev, pasid);
1775 return intel_pasid_replace_second_level(iommu, domain, dev,
1776 iommu_domain_did(old, iommu),
1777 pasid);
1778 }
1779
1780 static int domain_setup_passthrough(struct intel_iommu *iommu,
1781 struct device *dev, ioasid_t pasid,
1782 struct iommu_domain *old)
1783 {
1784 if (!old)
1785 return intel_pasid_setup_pass_through(iommu, dev, pasid);
1786 return intel_pasid_replace_pass_through(iommu, dev,
1787 iommu_domain_did(old, iommu),
1788 pasid);
1789 }
1790
1791 static int domain_setup_first_level(struct intel_iommu *iommu,
1792 struct dmar_domain *domain,
1793 struct device *dev,
1794 u32 pasid, struct iommu_domain *old)
1795 {
1796 struct dma_pte *pgd = domain->pgd;
1797 int level, flags = 0;
1798
1799 level = agaw_to_level(domain->agaw);
1800 if (level != 4 && level != 5)
1801 return -EINVAL;
1802
1803 if (level == 5)
1804 flags |= PASID_FLAG_FL5LP;
1805
1806 if (domain->force_snooping)
1807 flags |= PASID_FLAG_PAGE_SNOOP;
1808
1809 return __domain_setup_first_level(iommu, dev, pasid,
1810 domain_id_iommu(domain, iommu),
1811 (pgd_t *)pgd, flags, old);
1812 }
1813
1814 static int dmar_domain_attach_device(struct dmar_domain *domain,
1815 struct device *dev)
1816 {
1817 struct device_domain_info *info = dev_iommu_priv_get(dev);
1818 struct intel_iommu *iommu = info->iommu;
1819 unsigned long flags;
1820 int ret;
1821
1822 ret = domain_attach_iommu(domain, iommu);
1823 if (ret)
1824 return ret;
1825
1826 info->domain = domain;
1827 spin_lock_irqsave(&domain->lock, flags);
1828 list_add(&info->link, &domain->devices);
1829 spin_unlock_irqrestore(&domain->lock, flags);
1830
1831 if (dev_is_real_dma_subdevice(dev))
1832 return 0;
1833
1834 if (!sm_supported(iommu))
1835 ret = domain_context_mapping(domain, dev);
1836 else if (domain->use_first_level)
1837 ret = domain_setup_first_level(iommu, domain, dev,
1838 IOMMU_NO_PASID, NULL);
1839 else
1840 ret = domain_setup_second_level(iommu, domain, dev,
1841 IOMMU_NO_PASID, NULL);
1842
1843 if (ret)
1844 goto out_block_translation;
1845
1846 iommu_enable_pci_caps(info);
1847
1848 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1849 if (ret)
1850 goto out_block_translation;
1851
1852 return 0;
1853
1854 out_block_translation:
1855 device_block_translation(dev);
1856 return ret;
1857 }
1858
1859 /**
1860 * device_rmrr_is_relaxable - Test whether the RMRR of this device
1861 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
1862 * @dev: device handle
1863 *
1864 * We assume that PCI USB devices with RMRRs have them largely
1865 * for historical reasons and that the RMRR space is not actively used post
1866 * boot. This exclusion may change if vendors begin to abuse it.
1867 *
1868 * The same exception is made for graphics devices, with the requirement that
1869 * any use of the RMRR regions will be torn down before assigning the device
1870 * to a guest.
1871 *
1872 * Return: true if the RMRR is relaxable, false otherwise
1873 */
1874 static bool device_rmrr_is_relaxable(struct device *dev)
1875 {
1876 struct pci_dev *pdev;
1877
1878 if (!dev_is_pci(dev))
1879 return false;
1880
1881 pdev = to_pci_dev(dev);
1882 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1883 return true;
1884 else
1885 return false;
1886 }
1887
1888 static int device_def_domain_type(struct device *dev)
1889 {
1890 struct device_domain_info *info = dev_iommu_priv_get(dev);
1891 struct intel_iommu *iommu = info->iommu;
1892
1893 /*
1894 * Hardware does not support the passthrough translation mode.
1895 * Always use a dynamic mapping domain.
1896 */
1897 if (!ecap_pass_through(iommu->ecap))
1898 return IOMMU_DOMAIN_DMA;
1899
1900 if (dev_is_pci(dev)) {
1901 struct pci_dev *pdev = to_pci_dev(dev);
1902
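/*
 * The Tylersburg isochronous quirk may require the integrated
 * Azalia (HD audio) device to stay identity mapped.
 */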
1903 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1904 return IOMMU_DOMAIN_IDENTITY;
1905 }
1906
1907 return 0;
1908 }
1909
1910 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1911 {
1912 /*
1913 * Start from a sane IOMMU hardware state.
1914 * If queued invalidation was already initialized by us
1915 * (for example, while enabling interrupt remapping), then
1916 * things are already rolling from a sane state.
1917 */
1918 if (!iommu->qi) {
1919 /*
1920 * Clear any previous faults.
1921 */
1922 dmar_fault(-1, iommu);
1923 /*
1924 * Disable queued invalidation if supported and already enabled
1925 * before OS handover.
1926 */
1927 dmar_disable_qi(iommu);
1928 }
1929
1930 if (dmar_enable_qi(iommu)) {
1931 /*
1932 * Queued invalidation is not enabled; fall back to register-based invalidation.
1933 */
1934 iommu->flush.flush_context = __iommu_flush_context;
1935 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1936 pr_info("%s: Using Register based invalidation\n",
1937 iommu->name);
1938 } else {
1939 iommu->flush.flush_context = qi_flush_context;
1940 iommu->flush.flush_iotlb = qi_flush_iotlb;
1941 pr_info("%s: Using Queued invalidation\n", iommu->name);
1942 }
1943 }
1944
1945 static int copy_context_table(struct intel_iommu *iommu,
1946 struct root_entry *old_re,
1947 struct context_entry **tbl,
1948 int bus, bool ext)
1949 {
1950 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1951 struct context_entry *new_ce = NULL, ce;
1952 struct context_entry *old_ce = NULL;
1953 struct root_entry re;
1954 phys_addr_t old_ce_phys;
1955
1956 tbl_idx = ext ? bus * 2 : bus;
1957 memcpy(&re, old_re, sizeof(re));
1958
1959 for (devfn = 0; devfn < 256; devfn++) {
1960 /* First calculate the correct index */
1961 idx = (ext ? devfn * 2 : devfn) % 256;
1962
1963 if (idx == 0) {
1964 /* First save what we may have and clean up */
1965 if (new_ce) {
1966 tbl[tbl_idx] = new_ce;
1967 __iommu_flush_cache(iommu, new_ce,
1968 VTD_PAGE_SIZE);
1969 pos = 1;
1970 }
1971
1972 if (old_ce)
1973 memunmap(old_ce);
1974
1975 ret = 0;
1976 if (devfn < 0x80)
1977 old_ce_phys = root_entry_lctp(&re);
1978 else
1979 old_ce_phys = root_entry_uctp(&re);
1980
1981 if (!old_ce_phys) {
1982 if (ext && devfn == 0) {
1983 /* No LCTP, try UCTP */
1984 devfn = 0x7f;
1985 continue;
1986 } else {
1987 goto out;
1988 }
1989 }
1990
1991 ret = -ENOMEM;
1992 old_ce = memremap(old_ce_phys, PAGE_SIZE,
1993 MEMREMAP_WB);
1994 if (!old_ce)
1995 goto out;
1996
1997 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
1998 if (!new_ce)
1999 goto out_unmap;
2000
2001 ret = 0;
2002 }
2003
2004 /* Now copy the context entry */
2005 memcpy(&ce, old_ce + idx, sizeof(ce));
2006
2007 if (!context_present(&ce))
2008 continue;
2009
2010 did = context_domain_id(&ce);
2011 if (did >= 0 && did < cap_ndoms(iommu->cap))
2012 set_bit(did, iommu->domain_ids);
2013
2014 set_context_copied(iommu, bus, devfn);
2015 new_ce[idx] = ce;
2016 }
2017
2018 tbl[tbl_idx + pos] = new_ce;
2019
2020 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2021
2022 out_unmap:
2023 memunmap(old_ce);
2024
2025 out:
2026 return ret;
2027 }
2028
2029 static int copy_translation_tables(struct intel_iommu *iommu)
2030 {
2031 struct context_entry **ctxt_tbls;
2032 struct root_entry *old_rt;
2033 phys_addr_t old_rt_phys;
2034 int ctxt_table_entries;
2035 u64 rtaddr_reg;
2036 int bus, ret;
2037 bool new_ext, ext;
2038
2039 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2040 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2041 new_ext = !!sm_supported(iommu);
2042
2043 /*
2044 * The RTT bit can only be changed when translation is disabled,
2045 * but disabling translation means to open a window for data
2046 * corruption. So bail out and don't copy anything if we would
2047 * have to change the bit.
2048 */
2049 if (new_ext != ext)
2050 return -EINVAL;
2051
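/*
 * One bit per 16-bit source-id (bus/devfn), tracking which context
 * entries were inherited from the previous kernel.
 */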
2052 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2053 if (!iommu->copied_tables)
2054 return -ENOMEM;
2055
2056 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2057 if (!old_rt_phys)
2058 return -EINVAL;
2059
2060 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2061 if (!old_rt)
2062 return -ENOMEM;
2063
2064 /* This is too big for the stack - allocate it from slab */
2065 ctxt_table_entries = ext ? 512 : 256;
2066 ret = -ENOMEM;
2067 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2068 if (!ctxt_tbls)
2069 goto out_unmap;
2070
2071 for (bus = 0; bus < 256; bus++) {
2072 ret = copy_context_table(iommu, &old_rt[bus],
2073 ctxt_tbls, bus, ext);
2074 if (ret) {
2075 pr_err("%s: Failed to copy context table for bus %d\n",
2076 iommu->name, bus);
2077 continue;
2078 }
2079 }
2080
2081 spin_lock(&iommu->lock);
2082
2083 /* Context tables are copied, now write them to the root_entry table */
2084 for (bus = 0; bus < 256; bus++) {
2085 int idx = ext ? bus * 2 : bus;
2086 u64 val;
2087
2088 if (ctxt_tbls[idx]) {
2089 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2090 iommu->root_entry[bus].lo = val;
2091 }
2092
2093 if (!ext || !ctxt_tbls[idx + 1])
2094 continue;
2095
2096 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2097 iommu->root_entry[bus].hi = val;
2098 }
2099
2100 spin_unlock(&iommu->lock);
2101
2102 kfree(ctxt_tbls);
2103
2104 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2105
2106 ret = 0;
2107
2108 out_unmap:
2109 memunmap(old_rt);
2110
2111 return ret;
2112 }
2113
2114 static int __init init_dmars(void)
2115 {
2116 struct dmar_drhd_unit *drhd;
2117 struct intel_iommu *iommu;
2118 int ret;
2119
2120 for_each_iommu(iommu, drhd) {
2121 if (drhd->ignored) {
2122 iommu_disable_translation(iommu);
2123 continue;
2124 }
2125
2126 /*
2127 * Find the max PASID size of all IOMMUs in the system.
2128 * We need to ensure the system PASID table is no bigger
2129 * than the smallest supported size.
2130 */
2131 if (pasid_supported(iommu)) {
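/* ECAP.PSS encodes the supported PASID width minus one, so the
 * number of PASIDs is 2^(pss + 1), i.e. 2 << pss. */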
2132 u32 temp = 2 << ecap_pss(iommu->ecap);
2133
2134 intel_pasid_max_id = min_t(u32, temp,
2135 intel_pasid_max_id);
2136 }
2137
2138 intel_iommu_init_qi(iommu);
2139
2140 ret = iommu_init_domains(iommu);
2141 if (ret)
2142 goto free_iommu;
2143
2144 init_translation_status(iommu);
2145
2146 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2147 iommu_disable_translation(iommu);
2148 clear_translation_pre_enabled(iommu);
2149 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2150 iommu->name);
2151 }
2152
2153 /*
2154 * TBD:
2155 * we could share the same root & context tables
2156 * among all IOMMUs. Needs to be split out later.
2157 */
2158 ret = iommu_alloc_root_entry(iommu);
2159 if (ret)
2160 goto free_iommu;
2161
2162 if (translation_pre_enabled(iommu)) {
2163 pr_info("Translation already enabled - trying to copy translation structures\n");
2164
2165 ret = copy_translation_tables(iommu);
2166 if (ret) {
2167 /*
2168 * We found the IOMMU with translation
2169 * enabled - but failed to copy over the
2170 * old root-entry table. Try to proceed
2171 * by disabling translation now and
2172 * allocating a clean root-entry table.
2173 * This might cause DMAR faults, but
2174 * probably the dump will still succeed.
2175 */
2176 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2177 iommu->name);
2178 iommu_disable_translation(iommu);
2179 clear_translation_pre_enabled(iommu);
2180 } else {
2181 pr_info("Copied translation tables from previous kernel for %s\n",
2182 iommu->name);
2183 }
2184 }
2185
2186 intel_svm_check(iommu);
2187 }
2188
2189 /*
2190 * Now that QI is enabled on all IOMMUs, set the root entry and flush
2191 * caches. This is required on some Intel X58 chipsets; otherwise the
2192 * flush_context function loops forever and the boot hangs.
2193 */
2194 for_each_active_iommu(iommu, drhd) {
2195 iommu_flush_write_buffer(iommu);
2196 iommu_set_root_entry(iommu);
2197 }
2198
2199 check_tylersburg_isoch();
2200
2201 /*
2202 * for each drhd
2203 * enable fault log
2204 * global invalidate context cache
2205 * global invalidate iotlb
2206 * enable translation
2207 */
2208 for_each_iommu(iommu, drhd) {
2209 if (drhd->ignored) {
2210 /*
2211 * we always have to disable PMRs or DMA may fail on
2212 * this device
2213 */
2214 if (force_on)
2215 iommu_disable_protect_mem_regions(iommu);
2216 continue;
2217 }
2218
2219 iommu_flush_write_buffer(iommu);
2220
2221 if (ecap_prs(iommu->ecap)) {
2222 /*
2223 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2224 * could cause a lock race, so drop the lock around the call.
2225 */
2226 up_write(&dmar_global_lock);
2227 ret = intel_iommu_enable_prq(iommu);
2228 down_write(&dmar_global_lock);
2229 if (ret)
2230 goto free_iommu;
2231 }
2232
2233 ret = dmar_set_interrupt(iommu);
2234 if (ret)
2235 goto free_iommu;
2236 }
2237
2238 return 0;
2239
2240 free_iommu:
2241 for_each_active_iommu(iommu, drhd) {
2242 disable_dmar_iommu(iommu);
2243 free_dmar_iommu(iommu);
2244 }
2245
2246 return ret;
2247 }
2248
2249 static void __init init_no_remapping_devices(void)
2250 {
2251 struct dmar_drhd_unit *drhd;
2252 struct device *dev;
2253 int i;
2254
2255 for_each_drhd_unit(drhd) {
2256 if (!drhd->include_all) {
2257 for_each_active_dev_scope(drhd->devices,
2258 drhd->devices_cnt, i, dev)
2259 break;
2260 /* ignore DMAR unit if no devices exist */
2261 if (i == drhd->devices_cnt)
2262 drhd->ignored = 1;
2263 }
2264 }
2265
2266 for_each_active_drhd_unit(drhd) {
2267 if (drhd->include_all)
2268 continue;
2269
2270 for_each_active_dev_scope(drhd->devices,
2271 drhd->devices_cnt, i, dev)
2272 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2273 break;
2274 if (i < drhd->devices_cnt)
2275 continue;
2276
2277 /* This IOMMU has *only* gfx devices. Mark it as dedicated to
2278 graphics and bypass it entirely if the graphics IOMMU is disabled. */
2279 drhd->gfx_dedicated = 1;
2280 if (disable_igfx_iommu)
2281 drhd->ignored = 1;
2282 }
2283 }
2284
2285 #ifdef CONFIG_SUSPEND
2286 static int init_iommu_hw(void)
2287 {
2288 struct dmar_drhd_unit *drhd;
2289 struct intel_iommu *iommu = NULL;
2290 int ret;
2291
2292 for_each_active_iommu(iommu, drhd) {
2293 if (iommu->qi) {
2294 ret = dmar_reenable_qi(iommu);
2295 if (ret)
2296 return ret;
2297 }
2298 }
2299
2300 for_each_iommu(iommu, drhd) {
2301 if (drhd->ignored) {
2302 /*
2303 * we always have to disable PMRs or DMA may fail on
2304 * this device
2305 */
2306 if (force_on)
2307 iommu_disable_protect_mem_regions(iommu);
2308 continue;
2309 }
2310
2311 iommu_flush_write_buffer(iommu);
2312 iommu_set_root_entry(iommu);
2313 iommu_enable_translation(iommu);
2314 iommu_disable_protect_mem_regions(iommu);
2315 }
2316
2317 return 0;
2318 }
2319
2320 static void iommu_flush_all(void)
2321 {
2322 struct dmar_drhd_unit *drhd;
2323 struct intel_iommu *iommu;
2324
2325 for_each_active_iommu(iommu, drhd) {
2326 iommu->flush.flush_context(iommu, 0, 0, 0,
2327 DMA_CCMD_GLOBAL_INVL);
2328 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2329 DMA_TLB_GLOBAL_FLUSH);
2330 }
2331 }
2332
2333 static int iommu_suspend(void)
2334 {
2335 struct dmar_drhd_unit *drhd;
2336 struct intel_iommu *iommu = NULL;
2337 unsigned long flag;
2338
2339 iommu_flush_all();
2340
2341 for_each_active_iommu(iommu, drhd) {
2342 iommu_disable_translation(iommu);
2343
2344 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2345
2346 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2347 readl(iommu->reg + DMAR_FECTL_REG);
2348 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2349 readl(iommu->reg + DMAR_FEDATA_REG);
2350 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2351 readl(iommu->reg + DMAR_FEADDR_REG);
2352 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2353 readl(iommu->reg + DMAR_FEUADDR_REG);
2354
2355 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2356 }
2357 return 0;
2358 }
2359
2360 static void iommu_resume(void)
2361 {
2362 struct dmar_drhd_unit *drhd;
2363 struct intel_iommu *iommu = NULL;
2364 unsigned long flag;
2365
2366 if (init_iommu_hw()) {
2367 if (force_on)
2368 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2369 else
2370 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2371 return;
2372 }
2373
2374 for_each_active_iommu(iommu, drhd) {
2375
2376 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2377
2378 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2379 iommu->reg + DMAR_FECTL_REG);
2380 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2381 iommu->reg + DMAR_FEDATA_REG);
2382 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2383 iommu->reg + DMAR_FEADDR_REG);
2384 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2385 iommu->reg + DMAR_FEUADDR_REG);
2386
2387 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2388 }
2389 }
2390
2391 static struct syscore_ops iommu_syscore_ops = {
2392 .resume = iommu_resume,
2393 .suspend = iommu_suspend,
2394 };
2395
2396 static void __init init_iommu_pm_ops(void)
2397 {
2398 register_syscore_ops(&iommu_syscore_ops);
2399 }
2400
2401 #else
2402 static inline void init_iommu_pm_ops(void) {}
2403 #endif /* CONFIG_SUSPEND */
2404
2405 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2406 {
2407 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2408 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2409 rmrr->end_address <= rmrr->base_address ||
2410 arch_rmrr_sanity_check(rmrr))
2411 return -EINVAL;
2412
2413 return 0;
2414 }
2415
2416 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2417 {
2418 struct acpi_dmar_reserved_memory *rmrr;
2419 struct dmar_rmrr_unit *rmrru;
2420
2421 rmrr = (struct acpi_dmar_reserved_memory *)header;
2422 if (rmrr_sanity_check(rmrr)) {
2423 pr_warn(FW_BUG
2424 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2425 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2426 rmrr->base_address, rmrr->end_address,
2427 dmi_get_system_info(DMI_BIOS_VENDOR),
2428 dmi_get_system_info(DMI_BIOS_VERSION),
2429 dmi_get_system_info(DMI_PRODUCT_VERSION));
2430 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2431 }
2432
2433 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2434 if (!rmrru)
2435 goto out;
2436
2437 rmrru->hdr = header;
2438
2439 rmrru->base_address = rmrr->base_address;
2440 rmrru->end_address = rmrr->end_address;
2441
2442 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2443 ((void *)rmrr) + rmrr->header.length,
2444 &rmrru->devices_cnt);
2445 if (rmrru->devices_cnt && rmrru->devices == NULL)
2446 goto free_rmrru;
2447
2448 list_add(&rmrru->list, &dmar_rmrr_units);
2449
2450 return 0;
2451 free_rmrru:
2452 kfree(rmrru);
2453 out:
2454 return -ENOMEM;
2455 }
2456
2457 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2458 {
2459 struct dmar_atsr_unit *atsru;
2460 struct acpi_dmar_atsr *tmp;
2461
2462 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2463 dmar_rcu_check()) {
2464 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2465 if (atsr->segment != tmp->segment)
2466 continue;
2467 if (atsr->header.length != tmp->header.length)
2468 continue;
2469 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2470 return atsru;
2471 }
2472
2473 return NULL;
2474 }
2475
2476 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2477 {
2478 struct acpi_dmar_atsr *atsr;
2479 struct dmar_atsr_unit *atsru;
2480
2481 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2482 return 0;
2483
2484 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2485 atsru = dmar_find_atsr(atsr);
2486 if (atsru)
2487 return 0;
2488
2489 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2490 if (!atsru)
2491 return -ENOMEM;
2492
2493 /*
2494 * If memory is allocated from slab by ACPI _DSM method, we need to
2495 * copy the memory content because the memory buffer will be freed
2496 * on return.
2497 */
2498 atsru->hdr = (void *)(atsru + 1);
2499 memcpy(atsru->hdr, hdr, hdr->length);
2500 atsru->include_all = atsr->flags & 0x1;
2501 if (!atsru->include_all) {
2502 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2503 (void *)atsr + atsr->header.length,
2504 &atsru->devices_cnt);
2505 if (atsru->devices_cnt && atsru->devices == NULL) {
2506 kfree(atsru);
2507 return -ENOMEM;
2508 }
2509 }
2510
2511 list_add_rcu(&atsru->list, &dmar_atsr_units);
2512
2513 return 0;
2514 }
2515
2516 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2517 {
2518 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2519 kfree(atsru);
2520 }
2521
2522 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2523 {
2524 struct acpi_dmar_atsr *atsr;
2525 struct dmar_atsr_unit *atsru;
2526
2527 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2528 atsru = dmar_find_atsr(atsr);
2529 if (atsru) {
2530 list_del_rcu(&atsru->list);
2531 synchronize_rcu();
2532 intel_iommu_free_atsr(atsru);
2533 }
2534
2535 return 0;
2536 }
2537
2538 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2539 {
2540 int i;
2541 struct device *dev;
2542 struct acpi_dmar_atsr *atsr;
2543 struct dmar_atsr_unit *atsru;
2544
2545 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2546 atsru = dmar_find_atsr(atsr);
2547 if (!atsru)
2548 return 0;
2549
2550 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2551 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2552 i, dev)
2553 return -EBUSY;
2554 }
2555
2556 return 0;
2557 }
2558
2559 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2560 {
2561 struct dmar_satc_unit *satcu;
2562 struct acpi_dmar_satc *tmp;
2563
2564 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2565 dmar_rcu_check()) {
2566 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2567 if (satc->segment != tmp->segment)
2568 continue;
2569 if (satc->header.length != tmp->header.length)
2570 continue;
2571 if (memcmp(satc, tmp, satc->header.length) == 0)
2572 return satcu;
2573 }
2574
2575 return NULL;
2576 }
2577
2578 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2579 {
2580 struct acpi_dmar_satc *satc;
2581 struct dmar_satc_unit *satcu;
2582
2583 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2584 return 0;
2585
2586 satc = container_of(hdr, struct acpi_dmar_satc, header);
2587 satcu = dmar_find_satc(satc);
2588 if (satcu)
2589 return 0;
2590
2591 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2592 if (!satcu)
2593 return -ENOMEM;
2594
2595 satcu->hdr = (void *)(satcu + 1);
2596 memcpy(satcu->hdr, hdr, hdr->length);
2597 satcu->atc_required = satc->flags & 0x1;
2598 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2599 (void *)satc + satc->header.length,
2600 &satcu->devices_cnt);
2601 if (satcu->devices_cnt && !satcu->devices) {
2602 kfree(satcu);
2603 return -ENOMEM;
2604 }
2605 list_add_rcu(&satcu->list, &dmar_satc_units);
2606
2607 return 0;
2608 }
2609
2610 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2611 {
2612 struct intel_iommu *iommu = dmaru->iommu;
2613 int ret;
2614
2615 /*
2616 * Disable translation if already enabled prior to OS handover.
2617 */
2618 if (iommu->gcmd & DMA_GCMD_TE)
2619 iommu_disable_translation(iommu);
2620
2621 ret = iommu_init_domains(iommu);
2622 if (ret == 0)
2623 ret = iommu_alloc_root_entry(iommu);
2624 if (ret)
2625 goto out;
2626
2627 intel_svm_check(iommu);
2628
2629 if (dmaru->ignored) {
2630 /*
2631 * we always have to disable PMRs or DMA may fail on this device
2632 */
2633 if (force_on)
2634 iommu_disable_protect_mem_regions(iommu);
2635 return 0;
2636 }
2637
2638 intel_iommu_init_qi(iommu);
2639 iommu_flush_write_buffer(iommu);
2640
2641 if (ecap_prs(iommu->ecap)) {
2642 ret = intel_iommu_enable_prq(iommu);
2643 if (ret)
2644 goto disable_iommu;
2645 }
2646
2647 ret = dmar_set_interrupt(iommu);
2648 if (ret)
2649 goto disable_iommu;
2650
2651 iommu_set_root_entry(iommu);
2652 iommu_enable_translation(iommu);
2653
2654 iommu_disable_protect_mem_regions(iommu);
2655 return 0;
2656
2657 disable_iommu:
2658 disable_dmar_iommu(iommu);
2659 out:
2660 free_dmar_iommu(iommu);
2661 return ret;
2662 }
2663
2664 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2665 {
2666 int ret = 0;
2667 struct intel_iommu *iommu = dmaru->iommu;
2668
2669 if (!intel_iommu_enabled)
2670 return 0;
2671 if (iommu == NULL)
2672 return -EINVAL;
2673
2674 if (insert) {
2675 ret = intel_iommu_add(dmaru);
2676 } else {
2677 disable_dmar_iommu(iommu);
2678 free_dmar_iommu(iommu);
2679 }
2680
2681 return ret;
2682 }
2683
2684 static void intel_iommu_free_dmars(void)
2685 {
2686 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2687 struct dmar_atsr_unit *atsru, *atsr_n;
2688 struct dmar_satc_unit *satcu, *satc_n;
2689
2690 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2691 list_del(&rmrru->list);
2692 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2693 kfree(rmrru);
2694 }
2695
2696 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2697 list_del(&atsru->list);
2698 intel_iommu_free_atsr(atsru);
2699 }
2700 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2701 list_del(&satcu->list);
2702 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2703 kfree(satcu);
2704 }
2705 }
2706
2707 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2708 {
2709 struct dmar_satc_unit *satcu;
2710 struct acpi_dmar_satc *satc;
2711 struct device *tmp;
2712 int i;
2713
2714 dev = pci_physfn(dev);
2715 rcu_read_lock();
2716
2717 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2718 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2719 if (satc->segment != pci_domain_nr(dev->bus))
2720 continue;
2721 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2722 if (to_pci_dev(tmp) == dev)
2723 goto out;
2724 }
2725 satcu = NULL;
2726 out:
2727 rcu_read_unlock();
2728 return satcu;
2729 }
2730
2731 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2732 {
2733 int i, ret = 1;
2734 struct pci_bus *bus;
2735 struct pci_dev *bridge = NULL;
2736 struct device *tmp;
2737 struct acpi_dmar_atsr *atsr;
2738 struct dmar_atsr_unit *atsru;
2739 struct dmar_satc_unit *satcu;
2740
2741 dev = pci_physfn(dev);
2742 satcu = dmar_find_matched_satc_unit(dev);
2743 if (satcu)
2744 /*
2745 * This device supports ATS as it is in the SATC table.
2746 * When the IOMMU is in legacy mode, enabling ATS is done
2747 * automatically by HW for the device that requires
2748 * ATS, hence the OS should not enable ATS on this device
2749 * to avoid duplicated TLB invalidations.
2750 */
2751 return !(satcu->atc_required && !sm_supported(iommu));
2752
2753 for (bus = dev->bus; bus; bus = bus->parent) {
2754 bridge = bus->self;
2755 /* If it's an integrated device, allow ATS */
2756 if (!bridge)
2757 return 1;
2758 /* Connected via non-PCIe: no ATS */
2759 if (!pci_is_pcie(bridge) ||
2760 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2761 return 0;
2762 /* If we found the root port, look it up in the ATSR */
2763 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2764 break;
2765 }
2766
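/*
 * The device sits below a PCIe root port; ATS is permitted only if
 * that root port is listed in an ATSR for this segment or the ATSR
 * is marked include-all.
 */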
2767 rcu_read_lock();
2768 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2769 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2770 if (atsr->segment != pci_domain_nr(dev->bus))
2771 continue;
2772
2773 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2774 if (tmp == &bridge->dev)
2775 goto out;
2776
2777 if (atsru->include_all)
2778 goto out;
2779 }
2780 ret = 0;
2781 out:
2782 rcu_read_unlock();
2783
2784 return ret;
2785 }
2786
2787 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2788 {
2789 int ret;
2790 struct dmar_rmrr_unit *rmrru;
2791 struct dmar_atsr_unit *atsru;
2792 struct dmar_satc_unit *satcu;
2793 struct acpi_dmar_atsr *atsr;
2794 struct acpi_dmar_reserved_memory *rmrr;
2795 struct acpi_dmar_satc *satc;
2796
2797 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2798 return 0;
2799
2800 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2801 rmrr = container_of(rmrru->hdr,
2802 struct acpi_dmar_reserved_memory, header);
2803 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2804 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2805 ((void *)rmrr) + rmrr->header.length,
2806 rmrr->segment, rmrru->devices,
2807 rmrru->devices_cnt);
2808 if (ret < 0)
2809 return ret;
2810 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2811 dmar_remove_dev_scope(info, rmrr->segment,
2812 rmrru->devices, rmrru->devices_cnt);
2813 }
2814 }
2815
2816 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2817 if (atsru->include_all)
2818 continue;
2819
2820 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2821 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2822 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2823 (void *)atsr + atsr->header.length,
2824 atsr->segment, atsru->devices,
2825 atsru->devices_cnt);
2826 if (ret > 0)
2827 break;
2828 else if (ret < 0)
2829 return ret;
2830 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2831 if (dmar_remove_dev_scope(info, atsr->segment,
2832 atsru->devices, atsru->devices_cnt))
2833 break;
2834 }
2835 }
2836 list_for_each_entry(satcu, &dmar_satc_units, list) {
2837 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2838 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2839 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2840 (void *)satc + satc->header.length,
2841 satc->segment, satcu->devices,
2842 satcu->devices_cnt);
2843 if (ret > 0)
2844 break;
2845 else if (ret < 0)
2846 return ret;
2847 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2848 if (dmar_remove_dev_scope(info, satc->segment,
2849 satcu->devices, satcu->devices_cnt))
2850 break;
2851 }
2852 }
2853
2854 return 0;
2855 }
2856
2857 static void intel_disable_iommus(void)
2858 {
2859 struct intel_iommu *iommu = NULL;
2860 struct dmar_drhd_unit *drhd;
2861
2862 for_each_iommu(iommu, drhd)
2863 iommu_disable_translation(iommu);
2864 }
2865
2866 void intel_iommu_shutdown(void)
2867 {
2868 struct dmar_drhd_unit *drhd;
2869 struct intel_iommu *iommu = NULL;
2870
2871 if (no_iommu || dmar_disabled)
2872 return;
2873
2874 /*
2875 * All other CPUs were brought down and hotplug interrupts were
2876 * disabled, so no locking or RCU checking is needed anymore.
2877 */
2878 list_for_each_entry(drhd, &dmar_drhd_units, list) {
2879 iommu = drhd->iommu;
2880
2881 /* Disable PMRs explicitly here. */
2882 iommu_disable_protect_mem_regions(iommu);
2883
2884 /* Make sure the IOMMUs are switched off */
2885 iommu_disable_translation(iommu);
2886 }
2887 }
2888
2889 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2890 {
2891 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2892
2893 return container_of(iommu_dev, struct intel_iommu, iommu);
2894 }
2895
2896 static ssize_t version_show(struct device *dev,
2897 struct device_attribute *attr, char *buf)
2898 {
2899 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2900 u32 ver = readl(iommu->reg + DMAR_VER_REG);
2901 return sysfs_emit(buf, "%d:%d\n",
2902 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2903 }
2904 static DEVICE_ATTR_RO(version);
2905
2906 static ssize_t address_show(struct device *dev,
2907 struct device_attribute *attr, char *buf)
2908 {
2909 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2910 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2911 }
2912 static DEVICE_ATTR_RO(address);
2913
2914 static ssize_t cap_show(struct device *dev,
2915 struct device_attribute *attr, char *buf)
2916 {
2917 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2918 return sysfs_emit(buf, "%llx\n", iommu->cap);
2919 }
2920 static DEVICE_ATTR_RO(cap);
2921
2922 static ssize_t ecap_show(struct device *dev,
2923 struct device_attribute *attr, char *buf)
2924 {
2925 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2926 return sysfs_emit(buf, "%llx\n", iommu->ecap);
2927 }
2928 static DEVICE_ATTR_RO(ecap);
2929
2930 static ssize_t domains_supported_show(struct device *dev,
2931 struct device_attribute *attr, char *buf)
2932 {
2933 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2934 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2935 }
2936 static DEVICE_ATTR_RO(domains_supported);
2937
2938 static ssize_t domains_used_show(struct device *dev,
2939 struct device_attribute *attr, char *buf)
2940 {
2941 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2942 return sysfs_emit(buf, "%d\n",
2943 bitmap_weight(iommu->domain_ids,
2944 cap_ndoms(iommu->cap)));
2945 }
2946 static DEVICE_ATTR_RO(domains_used);
2947
2948 static struct attribute *intel_iommu_attrs[] = {
2949 &dev_attr_version.attr,
2950 &dev_attr_address.attr,
2951 &dev_attr_cap.attr,
2952 &dev_attr_ecap.attr,
2953 &dev_attr_domains_supported.attr,
2954 &dev_attr_domains_used.attr,
2955 NULL,
2956 };
2957
2958 static struct attribute_group intel_iommu_group = {
2959 .name = "intel-iommu",
2960 .attrs = intel_iommu_attrs,
2961 };
2962
2963 const struct attribute_group *intel_iommu_groups[] = {
2964 &intel_iommu_group,
2965 NULL,
2966 };
2967
2968 static bool has_external_pci(void)
2969 {
2970 struct pci_dev *pdev = NULL;
2971
2972 for_each_pci_dev(pdev)
2973 if (pdev->external_facing) {
2974 pci_dev_put(pdev);
2975 return true;
2976 }
2977
2978 return false;
2979 }
2980
2981 static int __init platform_optin_force_iommu(void)
2982 {
2983 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2984 return 0;
2985
2986 if (no_iommu || dmar_disabled)
2987 pr_info("Intel-IOMMU force enabled due to platform opt-in\n");
2988
2989 /*
2990 * If Intel-IOMMU is disabled by default, we will apply identity
2991 * map for all devices except those marked as being untrusted.
2992 */
2993 if (dmar_disabled)
2994 iommu_set_default_passthrough(false);
2995
2996 dmar_disabled = 0;
2997 no_iommu = 0;
2998
2999 return 1;
3000 }
3001
3002 static int __init probe_acpi_namespace_devices(void)
3003 {
3004 struct dmar_drhd_unit *drhd;
3005 /* To avoid a -Wunused-but-set-variable warning. */
3006 struct intel_iommu *iommu __maybe_unused;
3007 struct device *dev;
3008 int i, ret = 0;
3009
3010 for_each_active_iommu(iommu, drhd) {
3011 for_each_active_dev_scope(drhd->devices,
3012 drhd->devices_cnt, i, dev) {
3013 struct acpi_device_physical_node *pn;
3014 struct acpi_device *adev;
3015
3016 if (dev->bus != &acpi_bus_type)
3017 continue;
3018
3019 up_read(&dmar_global_lock);
3020 adev = to_acpi_device(dev);
3021 mutex_lock(&adev->physical_node_lock);
3022 list_for_each_entry(pn,
3023 &adev->physical_node_list, node) {
3024 ret = iommu_probe_device(pn->dev);
3025 if (ret)
3026 break;
3027 }
3028 mutex_unlock(&adev->physical_node_lock);
3029 down_read(&dmar_global_lock);
3030
3031 if (ret)
3032 return ret;
3033 }
3034 }
3035
3036 return 0;
3037 }
3038
3039 static __init int tboot_force_iommu(void)
3040 {
3041 if (!tboot_enabled())
3042 return 0;
3043
3044 if (no_iommu || dmar_disabled)
3045 pr_warn("Forcing Intel-IOMMU to be enabled\n");
3046
3047 dmar_disabled = 0;
3048 no_iommu = 0;
3049
3050 return 1;
3051 }
3052
3053 int __init intel_iommu_init(void)
3054 {
3055 int ret = -ENODEV;
3056 struct dmar_drhd_unit *drhd;
3057 struct intel_iommu *iommu;
3058
3059 /*
3060 * Intel IOMMU is required for a TXT/tboot launch or platform
3061 * opt in, so enforce that.
3062 */
3063 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3064 platform_optin_force_iommu();
3065
3066 down_write(&dmar_global_lock);
3067 if (dmar_table_init()) {
3068 if (force_on)
3069 panic("tboot: Failed to initialize DMAR table\n");
3070 goto out_free_dmar;
3071 }
3072
3073 if (dmar_dev_scope_init() < 0) {
3074 if (force_on)
3075 panic("tboot: Failed to initialize DMAR device scope\n");
3076 goto out_free_dmar;
3077 }
3078
3079 up_write(&dmar_global_lock);
3080
3081 /*
3082 * The bus notifier takes the dmar_global_lock, so lockdep will
3083 * complain later when we register it under the lock.
3084 */
3085 dmar_register_bus_notifier();
3086
3087 down_write(&dmar_global_lock);
3088
3089 if (!no_iommu)
3090 intel_iommu_debugfs_init();
3091
3092 if (no_iommu || dmar_disabled) {
3093 /*
3094 * We exit the function here to ensure IOMMU's remapping and
3095 * mempool aren't setup, which means that the IOMMU's PMRs
3096 * won't be disabled via the call to init_dmars(). So disable
3097 * it explicitly here. The PMRs were setup by tboot prior to
3098 * calling SENTER, but the kernel is expected to reset/tear
3099 * down the PMRs.
3100 */
3101 if (intel_iommu_tboot_noforce) {
3102 for_each_iommu(iommu, drhd)
3103 iommu_disable_protect_mem_regions(iommu);
3104 }
3105
3106 /*
3107 * Make sure the IOMMUs are switched off, even when we
3108 * boot into a kexec kernel and the previous kernel left
3109 * them enabled
3110 */
3111 intel_disable_iommus();
3112 goto out_free_dmar;
3113 }
3114
3115 if (list_empty(&dmar_rmrr_units))
3116 pr_info("No RMRR found\n");
3117
3118 if (list_empty(&dmar_atsr_units))
3119 pr_info("No ATSR found\n");
3120
3121 if (list_empty(&dmar_satc_units))
3122 pr_info("No SATC found\n");
3123
3124 init_no_remapping_devices();
3125
3126 ret = init_dmars();
3127 if (ret) {
3128 if (force_on)
3129 panic("tboot: Failed to initialize DMARs\n");
3130 pr_err("Initialization failed\n");
3131 goto out_free_dmar;
3132 }
3133 up_write(&dmar_global_lock);
3134
3135 init_iommu_pm_ops();
3136
3137 down_read(&dmar_global_lock);
3138 for_each_active_iommu(iommu, drhd) {
3139 /*
3140 * The flush queue implementation does not perform
3141 * page-selective invalidations that are required for efficient
3142 * TLB flushes in virtual environments. The benefit of batching
3143 * is likely to be much lower than the overhead of synchronizing
3144 * the virtual and physical IOMMU page-tables.
3145 */
3146 if (cap_caching_mode(iommu->cap) &&
3147 !first_level_by_default(iommu)) {
3148 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3149 iommu_set_dma_strict();
3150 }
3151 iommu_device_sysfs_add(&iommu->iommu, NULL,
3152 intel_iommu_groups,
3153 "%s", iommu->name);
3154 /*
3155 * The iommu device probe is protected by the iommu_probe_device_lock.
3156 * Release the dmar_global_lock before entering the device probe path
3157 * to avoid unnecessary lock order splat.
3158 */
3159 up_read(&dmar_global_lock);
3160 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3161 down_read(&dmar_global_lock);
3162
3163 iommu_pmu_register(iommu);
3164 }
3165
3166 if (probe_acpi_namespace_devices())
3167 pr_warn("ACPI namespace devices didn't probe correctly\n");
3168
3169 /* Finally, we enable the DMA remapping hardware. */
3170 for_each_iommu(iommu, drhd) {
3171 if (!drhd->ignored && !translation_pre_enabled(iommu))
3172 iommu_enable_translation(iommu);
3173
3174 iommu_disable_protect_mem_regions(iommu);
3175 }
3176 up_read(&dmar_global_lock);
3177
3178 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3179
3180 intel_iommu_enabled = 1;
3181
3182 return 0;
3183
3184 out_free_dmar:
3185 intel_iommu_free_dmars();
3186 up_write(&dmar_global_lock);
3187 return ret;
3188 }
3189
3190 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3191 {
3192 struct device_domain_info *info = opaque;
3193
3194 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3195 return 0;
3196 }
3197
3198 /*
3199 * NB - intel-iommu lacks any sort of reference counting for the users of
3200 * dependent devices. If multiple endpoints have intersecting dependent
3201 * devices, unbinding the driver from any one of them will possibly leave
3202 * the others unable to operate.
3203 */
3204 static void domain_context_clear(struct device_domain_info *info)
3205 {
3206 if (!dev_is_pci(info->dev)) {
3207 domain_context_clear_one(info, info->bus, info->devfn);
3208 return;
3209 }
3210
3211 pci_for_each_dma_alias(to_pci_dev(info->dev),
3212 &domain_context_clear_one_cb, info);
3213 }
3214
3215 /*
3216 * Clear the page table pointer in context or pasid table entries so that
3217 * all DMA requests without PASID from the device are blocked. If the page
3218 * table has been set, clean up the data structures.
3219 */
3220 void device_block_translation(struct device *dev)
3221 {
3222 struct device_domain_info *info = dev_iommu_priv_get(dev);
3223 struct intel_iommu *iommu = info->iommu;
3224 unsigned long flags;
3225
3226 if (info->domain)
3227 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3228
3229 iommu_disable_pci_caps(info);
3230 if (!dev_is_real_dma_subdevice(dev)) {
3231 if (sm_supported(iommu))
3232 intel_pasid_tear_down_entry(iommu, dev,
3233 IOMMU_NO_PASID, false);
3234 else
3235 domain_context_clear(info);
3236 }
3237
3238 if (!info->domain)
3239 return;
3240
3241 spin_lock_irqsave(&info->domain->lock, flags);
3242 list_del(&info->link);
3243 spin_unlock_irqrestore(&info->domain->lock, flags);
3244
3245 domain_detach_iommu(info->domain, iommu);
3246 info->domain = NULL;
3247 }
3248
3249 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3250 struct device *dev)
3251 {
3252 device_block_translation(dev);
3253 return 0;
3254 }
3255
3256 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3257 struct device *dev, ioasid_t pasid,
3258 struct iommu_domain *old);
3259
3260 static struct iommu_domain blocking_domain = {
3261 .type = IOMMU_DOMAIN_BLOCKED,
3262 .ops = &(const struct iommu_domain_ops) {
3263 .attach_dev = blocking_domain_attach_dev,
3264 .set_dev_pasid = blocking_domain_set_dev_pasid,
3265 }
3266 };
3267
3268 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3269 {
3270 if (!intel_iommu_superpage)
3271 return 0;
3272
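/*
 * The return value is the number of supported superpage levels:
 * 1 means 2MiB pages, 2 means 2MiB and 1GiB. First-stage always
 * has 2MiB support; 1GiB depends on the FL1GP capability.
 */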
3273 if (first_stage)
3274 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3275
3276 return fls(cap_super_page_val(iommu->cap));
3277 }
3278
3279 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3280 {
3281 struct device_domain_info *info = dev_iommu_priv_get(dev);
3282 struct intel_iommu *iommu = info->iommu;
3283 struct dmar_domain *domain;
3284 int addr_width;
3285
3286 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3287 if (!domain)
3288 return ERR_PTR(-ENOMEM);
3289
3290 INIT_LIST_HEAD(&domain->devices);
3291 INIT_LIST_HEAD(&domain->dev_pasids);
3292 INIT_LIST_HEAD(&domain->cache_tags);
3293 spin_lock_init(&domain->lock);
3294 spin_lock_init(&domain->cache_lock);
3295 xa_init(&domain->iommu_array);
3296
3297 domain->nid = dev_to_node(dev);
3298 domain->use_first_level = first_stage;
3299
3300 /* calculate the address width */
3301 addr_width = agaw_to_width(iommu->agaw);
3302 if (addr_width > cap_mgaw(iommu->cap))
3303 addr_width = cap_mgaw(iommu->cap);
3304 domain->gaw = addr_width;
3305 domain->agaw = iommu->agaw;
3306 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3307
3308 /* iommu memory access coherency */
3309 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3310
3311 /* pagesize bitmap */
3312 domain->domain.pgsize_bitmap = SZ_4K;
3313 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3314 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3315
3316 /*
3317 * IOVA aperture: First-level translation restricts the input-address
3318 * to a canonical address (i.e., address bits 63:N have the same value
3319 * as address bit [N-1], where N is 48-bits with 4-level paging and
3320 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3321 */
3322 domain->domain.geometry.force_aperture = true;
3323 domain->domain.geometry.aperture_start = 0;
3324 if (first_stage)
3325 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3326 else
3327 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3328
3329 /* always allocate the top pgd */
3330 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3331 if (!domain->pgd) {
3332 kfree(domain);
3333 return ERR_PTR(-ENOMEM);
3334 }
3335 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3336
3337 return domain;
3338 }
3339
3340 static struct iommu_domain *
3341 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3342 const struct iommu_user_data *user_data)
3343 {
3344 struct device_domain_info *info = dev_iommu_priv_get(dev);
3345 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3346 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3347 struct intel_iommu *iommu = info->iommu;
3348 struct dmar_domain *dmar_domain;
3349 struct iommu_domain *domain;
3350 bool first_stage;
3351
3352 if (flags &
3353 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3354 return ERR_PTR(-EOPNOTSUPP);
3355 if (nested_parent && !nested_supported(iommu))
3356 return ERR_PTR(-EOPNOTSUPP);
3357 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3358 return ERR_PTR(-EOPNOTSUPP);
3359
3360 /*
3361 * Always allocate the guest compatible page table unless
3362 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3363 * is specified.
3364 */
3365 if (nested_parent || dirty_tracking) {
3366 if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3367 return ERR_PTR(-EOPNOTSUPP);
3368 first_stage = false;
3369 } else {
3370 first_stage = first_level_by_default(iommu);
3371 }
3372
3373 dmar_domain = paging_domain_alloc(dev, first_stage);
3374 if (IS_ERR(dmar_domain))
3375 return ERR_CAST(dmar_domain);
3376 domain = &dmar_domain->domain;
3377 domain->type = IOMMU_DOMAIN_UNMANAGED;
3378 domain->owner = &intel_iommu_ops;
3379 domain->ops = intel_iommu_ops.default_domain_ops;
3380
3381 if (nested_parent) {
3382 dmar_domain->nested_parent = true;
3383 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3384 spin_lock_init(&dmar_domain->s1_lock);
3385 }
3386
3387 if (dirty_tracking) {
3388 if (dmar_domain->use_first_level) {
3389 iommu_domain_free(domain);
3390 return ERR_PTR(-EOPNOTSUPP);
3391 }
3392 domain->dirty_ops = &intel_dirty_ops;
3393 }
3394
3395 return domain;
3396 }
3397
3398 static void intel_iommu_domain_free(struct iommu_domain *domain)
3399 {
3400 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3401
3402 WARN_ON(dmar_domain->nested_parent &&
3403 !list_empty(&dmar_domain->s1_domains));
3404 domain_exit(dmar_domain);
3405 }
3406
3407 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3408 {
3409 struct device_domain_info *info = dev_iommu_priv_get(dev);
3410 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3411 struct intel_iommu *iommu = info->iommu;
3412 int addr_width;
3413
3414 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3415 return -EPERM;
3416
3417 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3418 return -EINVAL;
3419
3420 if (domain->dirty_ops && !ssads_supported(iommu))
3421 return -EINVAL;
3422
3423 if (dmar_domain->iommu_coherency !=
3424 iommu_paging_structure_coherency(iommu))
3425 return -EINVAL;
3426
3427 if (dmar_domain->iommu_superpage !=
3428 iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3429 return -EINVAL;
3430
3431 if (dmar_domain->use_first_level &&
3432 (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3433 return -EINVAL;
3434
3435 /* check if this iommu agaw is sufficient for max mapped address */
3436 addr_width = agaw_to_width(iommu->agaw);
3437 if (addr_width > cap_mgaw(iommu->cap))
3438 addr_width = cap_mgaw(iommu->cap);
3439
3440 if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3441 return -EINVAL;
3442
3443 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3444 context_copied(iommu, info->bus, info->devfn))
3445 return intel_pasid_setup_sm_context(dev);
3446
3447 return 0;
3448 }
3449
3450 static int intel_iommu_attach_device(struct iommu_domain *domain,
3451 struct device *dev)
3452 {
3453 int ret;
3454
3455 device_block_translation(dev);
3456
3457 ret = paging_domain_compatible(domain, dev);
3458 if (ret)
3459 return ret;
3460
3461 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3462 }
3463
3464 static int intel_iommu_map(struct iommu_domain *domain,
3465 unsigned long iova, phys_addr_t hpa,
3466 size_t size, int iommu_prot, gfp_t gfp)
3467 {
3468 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3469 u64 max_addr;
3470 int prot = 0;
3471
3472 if (iommu_prot & IOMMU_READ)
3473 prot |= DMA_PTE_READ;
3474 if (iommu_prot & IOMMU_WRITE)
3475 prot |= DMA_PTE_WRITE;
3476 if (dmar_domain->set_pte_snp)
3477 prot |= DMA_PTE_SNP;
3478
3479 max_addr = iova + size;
3480 if (dmar_domain->max_addr < max_addr) {
3481 u64 end;
3482
3483 /* check if minimum agaw is sufficient for mapped address */
3484 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3485 if (end < max_addr) {
3486 pr_err("%s: iommu width (%d) is not "
3487 "sufficient for the mapped address (%llx)\n",
3488 __func__, dmar_domain->gaw, max_addr);
3489 return -EFAULT;
3490 }
3491 dmar_domain->max_addr = max_addr;
3492 }
3493 /* Round up size to next multiple of PAGE_SIZE, if it and
3494 the low bits of hpa would take us onto the next page */
3495 size = aligned_nrpages(hpa, size);
3496 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3497 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3498 }
3499
3500 static int intel_iommu_map_pages(struct iommu_domain *domain,
3501 unsigned long iova, phys_addr_t paddr,
3502 size_t pgsize, size_t pgcount,
3503 int prot, gfp_t gfp, size_t *mapped)
3504 {
3505 unsigned long pgshift = __ffs(pgsize);
3506 size_t size = pgcount << pgshift;
3507 int ret;
3508
3509 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3510 return -EINVAL;
3511
3512 if (!IS_ALIGNED(iova | paddr, pgsize))
3513 return -EINVAL;
3514
3515 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3516 if (!ret && mapped)
3517 *mapped = size;
3518
3519 return ret;
3520 }
3521
3522 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3523 unsigned long iova, size_t size,
3524 struct iommu_iotlb_gather *gather)
3525 {
3526 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3527 unsigned long start_pfn, last_pfn;
3528 int level = 0;
3529
3530 /* Cope with horrid API which requires us to unmap more than the
3531 size argument if it happens to be a large-page mapping. */
3532 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3533 &level, GFP_ATOMIC)))
3534 return 0;
3535
3536 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3537 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3538
3539 start_pfn = iova >> VTD_PAGE_SHIFT;
3540 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3541
3542 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3543
3544 if (dmar_domain->max_addr == iova + size)
3545 dmar_domain->max_addr = iova;
3546
3547 /*
3548 * We do not use page-selective IOTLB invalidation in the flush queue,
3549 * so there is no need to track the page and sync the IOTLB.
3550 */
3551 if (!iommu_iotlb_gather_queued(gather))
3552 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3553
3554 return size;
3555 }
3556
3557 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3558 unsigned long iova,
3559 size_t pgsize, size_t pgcount,
3560 struct iommu_iotlb_gather *gather)
3561 {
3562 unsigned long pgshift = __ffs(pgsize);
3563 size_t size = pgcount << pgshift;
3564
3565 return intel_iommu_unmap(domain, iova, size, gather);
3566 }
3567
3568 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3569 struct iommu_iotlb_gather *gather)
3570 {
3571 cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3572 gather->end, list_empty(&gather->freelist));
3573 iommu_put_pages_list(&gather->freelist);
3574 }
3575
3576 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3577 dma_addr_t iova)
3578 {
3579 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3580 struct dma_pte *pte;
3581 int level = 0;
3582 u64 phys = 0;
3583
3584 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3585 GFP_ATOMIC);
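/* Combine the page frame from the PTE with the offset of the IOVA
 * inside that (possibly super) page. */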
3586 if (pte && dma_pte_present(pte))
3587 phys = dma_pte_addr(pte) +
3588 (iova & (BIT_MASK(level_to_offset_bits(level) +
3589 VTD_PAGE_SHIFT) - 1));
3590
3591 return phys;
3592 }
3593
3594 static bool domain_support_force_snooping(struct dmar_domain *domain)
3595 {
3596 struct device_domain_info *info;
3597 bool support = true;
3598
3599 assert_spin_locked(&domain->lock);
3600 list_for_each_entry(info, &domain->devices, link) {
3601 if (!ecap_sc_support(info->iommu->ecap)) {
3602 support = false;
3603 break;
3604 }
3605 }
3606
3607 return support;
3608 }
3609
3610 static void domain_set_force_snooping(struct dmar_domain *domain)
3611 {
3612 struct device_domain_info *info;
3613
3614 assert_spin_locked(&domain->lock);
3615 /*
3616 * Second-level page tables support per-PTE snoop control. The
3617 * iommu_map() interface will handle this by setting the SNP bit.
3618 */
3619 if (!domain->use_first_level) {
3620 domain->set_pte_snp = true;
3621 return;
3622 }
3623
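/*
 * First-stage page tables have no per-PTE snoop control, so enforce
 * snooping through the PGSNP bit in the PASID-table entry of each
 * device already attached to this domain.
 */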
3624 list_for_each_entry(info, &domain->devices, link)
3625 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3626 IOMMU_NO_PASID);
3627 }
3628
3629 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3630 {
3631 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3632 unsigned long flags;
3633
3634 if (dmar_domain->force_snooping)
3635 return true;
3636
3637 spin_lock_irqsave(&dmar_domain->lock, flags);
3638 if (!domain_support_force_snooping(dmar_domain) ||
3639 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3640 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3641 return false;
3642 }
3643
3644 domain_set_force_snooping(dmar_domain);
3645 dmar_domain->force_snooping = true;
3646 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3647
3648 return true;
3649 }
3650
3651 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3652 {
3653 struct device_domain_info *info = dev_iommu_priv_get(dev);
3654
3655 switch (cap) {
3656 case IOMMU_CAP_CACHE_COHERENCY:
3657 case IOMMU_CAP_DEFERRED_FLUSH:
3658 return true;
3659 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3660 return dmar_platform_optin();
3661 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3662 return ecap_sc_support(info->iommu->ecap);
3663 case IOMMU_CAP_DIRTY_TRACKING:
3664 return ssads_supported(info->iommu);
3665 default:
3666 return false;
3667 }
3668 }
3669
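/*
 * Per-device probe: look up the IOMMU that translates this device, allocate
 * and populate its device_domain_info (ATS/PASID/PRI capabilities), and set
 * up the PASID table when the IOMMU is running in scalable mode.
 */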
3670 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3671 {
3672 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3673 struct device_domain_info *info;
3674 struct intel_iommu *iommu;
3675 u8 bus, devfn;
3676 int ret;
3677
3678 iommu = device_lookup_iommu(dev, &bus, &devfn);
3679 if (!iommu || !iommu->iommu.ops)
3680 return ERR_PTR(-ENODEV);
3681
3682 info = kzalloc(sizeof(*info), GFP_KERNEL);
3683 if (!info)
3684 return ERR_PTR(-ENOMEM);
3685
3686 if (dev_is_real_dma_subdevice(dev)) {
3687 info->bus = pdev->bus->number;
3688 info->devfn = pdev->devfn;
3689 info->segment = pci_domain_nr(pdev->bus);
3690 } else {
3691 info->bus = bus;
3692 info->devfn = devfn;
3693 info->segment = iommu->segment;
3694 }
3695
3696 info->dev = dev;
3697 info->iommu = iommu;
3698 if (dev_is_pci(dev)) {
3699 if (ecap_dev_iotlb_support(iommu->ecap) &&
3700 pci_ats_supported(pdev) &&
3701 dmar_ats_supported(pdev, iommu)) {
3702 info->ats_supported = 1;
3703 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3704
3705 /*
3706 * For an IOMMU that supports device IOTLB throttling
3707 * (DIT), assign the PF source ID (PFSID) to a VF's
3708 * invalidation descriptors so the hardware can gauge
3709 * queue depth at the PF level. If DIT is not set, the
3710 * PFSID field is treated as reserved and must be 0.
3711 */
3712 if (ecap_dit(iommu->ecap))
3713 info->pfsid = pci_dev_id(pci_physfn(pdev));
3714 info->ats_qdep = pci_ats_queue_depth(pdev);
3715 }
3716 if (sm_supported(iommu)) {
3717 if (pasid_supported(iommu)) {
3718 int features = pci_pasid_features(pdev);
3719
3720 if (features >= 0)
3721 info->pasid_supported = features | 1;
3722 }
3723
3724 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3725 pci_pri_supported(pdev))
3726 info->pri_supported = 1;
3727 }
3728 }
3729
3730 dev_iommu_priv_set(dev, info);
3731 if (pdev && pci_ats_supported(pdev)) {
3732 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3733 ret = device_rbtree_insert(iommu, info);
3734 if (ret)
3735 goto free;
3736 }
3737
3738 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3739 ret = intel_pasid_alloc_table(dev);
3740 if (ret) {
3741 dev_err(dev, "PASID table allocation failed\n");
3742 goto clear_rbtree;
3743 }
3744
3745 if (!context_copied(iommu, info->bus, info->devfn)) {
3746 ret = intel_pasid_setup_sm_context(dev);
3747 if (ret)
3748 goto free_table;
3749 }
3750 }
3751
3752 intel_iommu_debugfs_create_dev(info);
3753
3754 /*
3755 * The PCIe spec, in its wisdom, declares that the behaviour of the
3756 * device is undefined if you enable PASID support after ATS support.
3757 * So always enable PASID support on devices which have it, even if
3758 * we can't yet know if we're ever going to use it.
3759 */
3760 if (info->pasid_supported &&
3761 !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3762 info->pasid_enabled = 1;
3763
3764 return &iommu->iommu;
3765 free_table:
3766 intel_pasid_free_table(dev);
3767 clear_rbtree:
3768 device_rbtree_remove(info);
3769 free:
3770 kfree(info);
3771
3772 return ERR_PTR(ret);
3773 }
3774
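/*
 * Undo intel_iommu_probe_device(): disable PASID, drop the device from the
 * IOMMU's rbtree, tear down the scalable-mode context and PASID table, and
 * free the device_domain_info.
 */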
3775 static void intel_iommu_release_device(struct device *dev)
3776 {
3777 struct device_domain_info *info = dev_iommu_priv_get(dev);
3778 struct intel_iommu *iommu = info->iommu;
3779
3780 if (info->pasid_enabled) {
3781 pci_disable_pasid(to_pci_dev(dev));
3782 info->pasid_enabled = 0;
3783 }
3784
3785 mutex_lock(&iommu->iopf_lock);
3786 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3787 device_rbtree_remove(info);
3788 mutex_unlock(&iommu->iopf_lock);
3789
3790 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3791 !context_copied(iommu, info->bus, info->devfn))
3792 intel_pasid_teardown_sm_context(dev);
3793
3794 intel_pasid_free_table(dev);
3795 intel_iommu_debugfs_remove_dev(info);
3796 kfree(info);
3797 set_dma_ops(dev, NULL);
3798 }
3799
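/*
 * Report the device's reserved regions: RMRR ranges that must remain
 * direct mapped, the optional ISA/floppy workaround range, and the
 * IOAPIC MSI window.
 */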
3800 static void intel_iommu_get_resv_regions(struct device *device,
3801 struct list_head *head)
3802 {
3803 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3804 struct iommu_resv_region *reg;
3805 struct dmar_rmrr_unit *rmrr;
3806 struct device *i_dev;
3807 int i;
3808
3809 rcu_read_lock();
3810 for_each_rmrr_units(rmrr) {
3811 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3812 i, i_dev) {
3813 struct iommu_resv_region *resv;
3814 enum iommu_resv_type type;
3815 size_t length;
3816
3817 if (i_dev != device &&
3818 !is_downstream_to_pci_bridge(device, i_dev))
3819 continue;
3820
3821 length = rmrr->end_address - rmrr->base_address + 1;
3822
3823 type = device_rmrr_is_relaxable(device) ?
3824 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3825
3826 resv = iommu_alloc_resv_region(rmrr->base_address,
3827 length, prot, type,
3828 GFP_ATOMIC);
3829 if (!resv)
3830 break;
3831
3832 list_add_tail(&resv->list, head);
3833 }
3834 }
3835 rcu_read_unlock();
3836
3837 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3838 if (dev_is_pci(device)) {
3839 struct pci_dev *pdev = to_pci_dev(device);
3840
3841 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3842 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3843 IOMMU_RESV_DIRECT_RELAXABLE,
3844 GFP_KERNEL);
3845 if (reg)
3846 list_add_tail(&reg->list, head);
3847 }
3848 }
3849 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3850
3851 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3852 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3853 0, IOMMU_RESV_MSI, GFP_KERNEL);
3854 if (!reg)
3855 return;
3856 list_add_tail(&reg->list, head);
3857 }
3858
3859 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3860 {
3861 if (dev_is_pci(dev))
3862 return pci_device_group(dev);
3863 return generic_device_group(dev);
3864 }
3865
3866 static int intel_iommu_enable_sva(struct device *dev)
3867 {
3868 struct device_domain_info *info = dev_iommu_priv_get(dev);
3869 struct intel_iommu *iommu;
3870
3871 if (!info || dmar_disabled)
3872 return -EINVAL;
3873
3874 iommu = info->iommu;
3875 if (!iommu)
3876 return -EINVAL;
3877
3878 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
3879 return -ENODEV;
3880
3881 if (!info->pasid_enabled || !info->ats_enabled)
3882 return -EINVAL;
3883
3884 /*
3885 * Devices with device-specific I/O fault handling are not
3886 * expected to support PCI PRI. The IOMMU has no means to
3887 * probe for device-specific IOPF capability, so it can only
3888 * assume that a driver enabling SVA on a non-PRI device will
3889 * handle I/O page faults in its own way.
3890 */
3891 if (!info->pri_supported)
3892 return 0;
3893
3894 /* Devices supporting PRI should have it enabled. */
3895 if (!info->pri_enabled)
3896 return -EINVAL;
3897
3898 return 0;
3899 }
3900
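/*
 * Set or clear the Page Request Enable (PRE) bit in the device's
 * scalable-mode context entry and flush the context cache so the change
 * takes effect.
 */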
3901 static int context_flip_pri(struct device_domain_info *info, bool enable)
3902 {
3903 struct intel_iommu *iommu = info->iommu;
3904 u8 bus = info->bus, devfn = info->devfn;
3905 struct context_entry *context;
3906 u16 did;
3907
3908 spin_lock(&iommu->lock);
3909 if (context_copied(iommu, bus, devfn)) {
3910 spin_unlock(&iommu->lock);
3911 return -EINVAL;
3912 }
3913
3914 context = iommu_context_addr(iommu, bus, devfn, false);
3915 if (!context || !context_present(context)) {
3916 spin_unlock(&iommu->lock);
3917 return -ENODEV;
3918 }
3919 did = context_domain_id(context);
3920
3921 if (enable)
3922 context_set_sm_pre(context);
3923 else
3924 context_clear_sm_pre(context);
3925
3926 if (!ecap_coherent(iommu->ecap))
3927 clflush_cache_range(context, sizeof(*context));
3928 intel_context_flush_present(info, context, did, true);
3929 spin_unlock(&iommu->lock);
3930
3931 return 0;
3932 }
3933
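/*
 * Enable I/O page fault handling for the device: reset PCI PRI, add the
 * device to the IOMMU's page fault queue, turn on PRE in its context entry,
 * and finally enable PRI on the device itself.
 */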
3934 static int intel_iommu_enable_iopf(struct device *dev)
3935 {
3936 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3937 struct device_domain_info *info = dev_iommu_priv_get(dev);
3938 struct intel_iommu *iommu;
3939 int ret;
3940
3941 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
3942 return -ENODEV;
3943
3944 if (info->pri_enabled)
3945 return -EBUSY;
3946
3947 iommu = info->iommu;
3948 if (!iommu)
3949 return -EINVAL;
3950
3951 /* PASID is required in PRG Response Message. */
3952 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
3953 return -EINVAL;
3954
3955 ret = pci_reset_pri(pdev);
3956 if (ret)
3957 return ret;
3958
3959 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3960 if (ret)
3961 return ret;
3962
3963 ret = context_flip_pri(info, true);
3964 if (ret)
3965 goto err_remove_device;
3966
3967 ret = pci_enable_pri(pdev, PRQ_DEPTH);
3968 if (ret)
3969 goto err_clear_pri;
3970
3971 info->pri_enabled = 1;
3972
3973 return 0;
3974 err_clear_pri:
3975 context_flip_pri(info, false);
3976 err_remove_device:
3977 iopf_queue_remove_device(iommu->iopf_queue, dev);
3978
3979 return ret;
3980 }
3981
3982 static int intel_iommu_disable_iopf(struct device *dev)
3983 {
3984 struct device_domain_info *info = dev_iommu_priv_get(dev);
3985 struct intel_iommu *iommu = info->iommu;
3986
3987 if (!info->pri_enabled)
3988 return -EINVAL;
3989
3990 /* Disable new PRI reception: */
3991 context_flip_pri(info, false);
3992
3993 /*
3994 * Remove device from fault queue and acknowledge all outstanding
3995 * PRQs to the device:
3996 */
3997 iopf_queue_remove_device(iommu->iopf_queue, dev);
3998
3999 /*
4000 * The PCIe spec states that clearing the PRI enable bit stops the
4001 * Page Request Interface from issuing new page requests, but it
4002 * may still have outstanding page requests that have been
4003 * transmitted or are queued for transmission. This is supposed to
4004 * be called after the device driver has stopped DMA, all PASIDs
4005 * have been unbound and the outstanding PRQs have been drained.
4006 */
4007 pci_disable_pri(to_pci_dev(dev));
4008 info->pri_enabled = 0;
4009
4010 return 0;
4011 }
4012
4013 static int
4014 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4015 {
4016 switch (feat) {
4017 case IOMMU_DEV_FEAT_IOPF:
4018 return intel_iommu_enable_iopf(dev);
4019
4020 case IOMMU_DEV_FEAT_SVA:
4021 return intel_iommu_enable_sva(dev);
4022
4023 default:
4024 return -ENODEV;
4025 }
4026 }
4027
4028 static int
4029 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4030 {
4031 switch (feat) {
4032 case IOMMU_DEV_FEAT_IOPF:
4033 return intel_iommu_disable_iopf(dev);
4034
4035 case IOMMU_DEV_FEAT_SVA:
4036 return 0;
4037
4038 default:
4039 return -ENODEV;
4040 }
4041 }
4042
4043 static bool intel_iommu_is_attach_deferred(struct device *dev)
4044 {
4045 struct device_domain_info *info = dev_iommu_priv_get(dev);
4046
4047 return translation_pre_enabled(info->iommu) && !info->domain;
4048 }
4049
4050 /*
4051 * Check that the device does not sit behind an external-facing PCI port
4052 * that is marked as untrusted. Such devices must not be allowed to apply
4053 * quirks, and thus must not be able to bypass the IOMMU restrictions.
4054 */
4055 static bool risky_device(struct pci_dev *pdev)
4056 {
4057 if (pdev->untrusted) {
4058 pci_info(pdev,
4059 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4060 pdev->vendor, pdev->device);
4061 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4062 return true;
4063 }
4064 return false;
4065 }
4066
4067 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4068 unsigned long iova, size_t size)
4069 {
4070 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4071
4072 return 0;
4073 }
4074
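/*
 * Detach a {device, PASID} pair from a paging domain: unlink its
 * dev_pasid_info from the domain's list, release the cache tag and drop the
 * domain's reference on the IOMMU. The identity domain keeps no such state.
 */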
4075 void domain_remove_dev_pasid(struct iommu_domain *domain,
4076 struct device *dev, ioasid_t pasid)
4077 {
4078 struct device_domain_info *info = dev_iommu_priv_get(dev);
4079 struct dev_pasid_info *curr, *dev_pasid = NULL;
4080 struct intel_iommu *iommu = info->iommu;
4081 struct dmar_domain *dmar_domain;
4082 unsigned long flags;
4083
4084 if (!domain)
4085 return;
4086
4087 /* The identity domain has no per-PASID metadata to clean up. */
4088 if (domain->type == IOMMU_DOMAIN_IDENTITY)
4089 return;
4090
4091 dmar_domain = to_dmar_domain(domain);
4092 spin_lock_irqsave(&dmar_domain->lock, flags);
4093 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4094 if (curr->dev == dev && curr->pasid == pasid) {
4095 list_del(&curr->link_domain);
4096 dev_pasid = curr;
4097 break;
4098 }
4099 }
4100 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4101
4102 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4103 domain_detach_iommu(dmar_domain, iommu);
4104 if (!WARN_ON_ONCE(!dev_pasid)) {
4105 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4106 kfree(dev_pasid);
4107 }
4108 }
4109
4110 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4111 struct device *dev, ioasid_t pasid,
4112 struct iommu_domain *old)
4113 {
4114 struct device_domain_info *info = dev_iommu_priv_get(dev);
4115
4116 intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4117 domain_remove_dev_pasid(old, dev, pasid);
4118
4119 return 0;
4120 }
4121
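/*
 * Record a new {device, PASID} attachment on a paging domain: take a
 * reference on the IOMMU, assign a cache tag for the PASID, and link the
 * dev_pasid_info into the domain's list.
 */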
4122 struct dev_pasid_info *
4123 domain_add_dev_pasid(struct iommu_domain *domain,
4124 struct device *dev, ioasid_t pasid)
4125 {
4126 struct device_domain_info *info = dev_iommu_priv_get(dev);
4127 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4128 struct intel_iommu *iommu = info->iommu;
4129 struct dev_pasid_info *dev_pasid;
4130 unsigned long flags;
4131 int ret;
4132
4133 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4134 if (!dev_pasid)
4135 return ERR_PTR(-ENOMEM);
4136
4137 ret = domain_attach_iommu(dmar_domain, iommu);
4138 if (ret)
4139 goto out_free;
4140
4141 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4142 if (ret)
4143 goto out_detach_iommu;
4144
4145 dev_pasid->dev = dev;
4146 dev_pasid->pasid = pasid;
4147 spin_lock_irqsave(&dmar_domain->lock, flags);
4148 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4149 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4150
4151 return dev_pasid;
4152 out_detach_iommu:
4153 domain_detach_iommu(dmar_domain, iommu);
4154 out_free:
4155 kfree(dev_pasid);
4156 return ERR_PTR(ret);
4157 }
4158
4159 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4160 struct device *dev, ioasid_t pasid,
4161 struct iommu_domain *old)
4162 {
4163 struct device_domain_info *info = dev_iommu_priv_get(dev);
4164 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4165 struct intel_iommu *iommu = info->iommu;
4166 struct dev_pasid_info *dev_pasid;
4167 int ret;
4168
4169 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4170 return -EINVAL;
4171
4172 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4173 return -EOPNOTSUPP;
4174
4175 if (domain->dirty_ops)
4176 return -EINVAL;
4177
4178 if (context_copied(iommu, info->bus, info->devfn))
4179 return -EBUSY;
4180
4181 ret = paging_domain_compatible(domain, dev);
4182 if (ret)
4183 return ret;
4184
4185 dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4186 if (IS_ERR(dev_pasid))
4187 return PTR_ERR(dev_pasid);
4188
4189 if (dmar_domain->use_first_level)
4190 ret = domain_setup_first_level(iommu, dmar_domain,
4191 dev, pasid, old);
4192 else
4193 ret = domain_setup_second_level(iommu, dmar_domain,
4194 dev, pasid, old);
4195 if (ret)
4196 goto out_remove_dev_pasid;
4197
4198 domain_remove_dev_pasid(old, dev, pasid);
4199
4200 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4201
4202 return 0;
4203
4204 out_remove_dev_pasid:
4205 domain_remove_dev_pasid(domain, dev, pasid);
4206 return ret;
4207 }
4208
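/*
 * Export the raw capability and extended capability registers to user space
 * (iommufd), together with known errata flags.
 */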
4209 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4210 {
4211 struct device_domain_info *info = dev_iommu_priv_get(dev);
4212 struct intel_iommu *iommu = info->iommu;
4213 struct iommu_hw_info_vtd *vtd;
4214
4215 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4216 if (!vtd)
4217 return ERR_PTR(-ENOMEM);
4218
4219 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4220 vtd->cap_reg = iommu->cap;
4221 vtd->ecap_reg = iommu->ecap;
4222 *length = sizeof(*vtd);
4223 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4224 return vtd;
4225 }
4226
4227 /*
4228 * Set dirty tracking for the device list of a domain. The caller must
4229 * hold the domain->lock when calling it.
4230 */
4231 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4232 {
4233 struct device_domain_info *info;
4234 int ret = 0;
4235
4236 list_for_each_entry(info, devices, link) {
4237 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4238 IOMMU_NO_PASID, enable);
4239 if (ret)
4240 break;
4241 }
4242
4243 return ret;
4244 }
4245
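/*
 * Propagate a dirty-tracking change on a nested parent domain to the devices
 * attached to each of its stage-1 child domains, rolling back on failure.
 */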
4246 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4247 bool enable)
4248 {
4249 struct dmar_domain *s1_domain;
4250 unsigned long flags;
4251 int ret;
4252
4253 spin_lock(&domain->s1_lock);
4254 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4255 spin_lock_irqsave(&s1_domain->lock, flags);
4256 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4257 spin_unlock_irqrestore(&s1_domain->lock, flags);
4258 if (ret)
4259 goto err_unwind;
4260 }
4261 spin_unlock(&domain->s1_lock);
4262 return 0;
4263
4264 err_unwind:
4265 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4266 spin_lock_irqsave(&s1_domain->lock, flags);
4267 device_set_dirty_tracking(&s1_domain->devices,
4268 domain->dirty_tracking);
4269 spin_unlock_irqrestore(&s1_domain->lock, flags);
4270 }
4271 spin_unlock(&domain->s1_lock);
4272 return ret;
4273 }
4274
4275 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4276 bool enable)
4277 {
4278 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4279 int ret;
4280
4281 spin_lock(&dmar_domain->lock);
4282 if (dmar_domain->dirty_tracking == enable)
4283 goto out_unlock;
4284
4285 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4286 if (ret)
4287 goto err_unwind;
4288
4289 if (dmar_domain->nested_parent) {
4290 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4291 if (ret)
4292 goto err_unwind;
4293 }
4294
4295 dmar_domain->dirty_tracking = enable;
4296 out_unlock:
4297 spin_unlock(&dmar_domain->lock);
4298
4299 return 0;
4300
4301 err_unwind:
4302 device_set_dirty_tracking(&dmar_domain->devices,
4303 dmar_domain->dirty_tracking);
4304 spin_unlock(&dmar_domain->lock);
4305 return ret;
4306 }
4307
4308 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4309 unsigned long iova, size_t size,
4310 unsigned long flags,
4311 struct iommu_dirty_bitmap *dirty)
4312 {
4313 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4314 unsigned long end = iova + size - 1;
4315 unsigned long pgsize;
4316
4317 /*
4318 * IOMMUFD core calls into a dirty tracking disabled domain without an
4319 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4320 * have occurred when we stopped dirty tracking. This ensures that we
4321 * never inherit dirtied bits from a previous cycle.
4322 */
4323 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4324 return -EINVAL;
4325
4326 do {
4327 struct dma_pte *pte;
4328 int lvl = 0;
4329
4330 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4331 GFP_ATOMIC);
4332 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4333 if (!pte || !dma_pte_present(pte)) {
4334 iova += pgsize;
4335 continue;
4336 }
4337
4338 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4339 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4340 iova += pgsize;
4341 } while (iova < end);
4342
4343 return 0;
4344 }
4345
4346 static const struct iommu_dirty_ops intel_dirty_ops = {
4347 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4348 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4349 };
4350
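/*
 * Program a legacy-mode context entry for pass-through translation
 * (CONTEXT_TT_PASS_THROUGH) with the default domain ID, then flush the
 * context cache.
 */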
4351 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4352 {
4353 struct device_domain_info *info = dev_iommu_priv_get(dev);
4354 struct intel_iommu *iommu = info->iommu;
4355 struct context_entry *context;
4356
4357 spin_lock(&iommu->lock);
4358 context = iommu_context_addr(iommu, bus, devfn, 1);
4359 if (!context) {
4360 spin_unlock(&iommu->lock);
4361 return -ENOMEM;
4362 }
4363
4364 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4365 spin_unlock(&iommu->lock);
4366 return 0;
4367 }
4368
4369 copied_context_tear_down(iommu, context, bus, devfn);
4370 context_clear_entry(context);
4371 context_set_domain_id(context, FLPT_DEFAULT_DID);
4372
4373 /*
4374 * In pass-through mode, AW must be programmed with the largest AGAW
4375 * value supported by the hardware; ASR is ignored by the hardware.
4376 */
4377 context_set_address_width(context, iommu->msagaw);
4378 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4379 context_set_fault_enable(context);
4380 context_set_present(context);
4381 if (!ecap_coherent(iommu->ecap))
4382 clflush_cache_range(context, sizeof(*context));
4383 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4384 spin_unlock(&iommu->lock);
4385
4386 return 0;
4387 }
4388
4389 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4390 {
4391 struct device *dev = data;
4392
4393 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4394 }
4395
4396 static int device_setup_pass_through(struct device *dev)
4397 {
4398 struct device_domain_info *info = dev_iommu_priv_get(dev);
4399
4400 if (!dev_is_pci(dev))
4401 return context_setup_pass_through(dev, info->bus, info->devfn);
4402
4403 return pci_for_each_dma_alias(to_pci_dev(dev),
4404 context_setup_pass_through_cb, dev);
4405 }
4406
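/*
 * Attach the device to the global identity domain: block any existing
 * translation first, then set up pass-through either via the PASID table
 * (scalable mode) or via the context entries (legacy mode).
 */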
4407 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4408 {
4409 struct device_domain_info *info = dev_iommu_priv_get(dev);
4410 struct intel_iommu *iommu = info->iommu;
4411 int ret;
4412
4413 device_block_translation(dev);
4414
4415 if (dev_is_real_dma_subdevice(dev))
4416 return 0;
4417
4418 if (sm_supported(iommu)) {
4419 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4420 if (!ret)
4421 iommu_enable_pci_caps(info);
4422 } else {
4423 ret = device_setup_pass_through(dev);
4424 }
4425
4426 return ret;
4427 }
4428
4429 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4430 struct device *dev, ioasid_t pasid,
4431 struct iommu_domain *old)
4432 {
4433 struct device_domain_info *info = dev_iommu_priv_get(dev);
4434 struct intel_iommu *iommu = info->iommu;
4435 int ret;
4436
4437 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4438 return -EOPNOTSUPP;
4439
4440 ret = domain_setup_passthrough(iommu, dev, pasid, old);
4441 if (ret)
4442 return ret;
4443
4444 domain_remove_dev_pasid(old, dev, pasid);
4445 return 0;
4446 }
4447
4448 static struct iommu_domain identity_domain = {
4449 .type = IOMMU_DOMAIN_IDENTITY,
4450 .ops = &(const struct iommu_domain_ops) {
4451 .attach_dev = identity_domain_attach_dev,
4452 .set_dev_pasid = identity_domain_set_dev_pasid,
4453 },
4454 };
4455
4456 const struct iommu_ops intel_iommu_ops = {
4457 .blocked_domain = &blocking_domain,
4458 .release_domain = &blocking_domain,
4459 .identity_domain = &identity_domain,
4460 .capable = intel_iommu_capable,
4461 .hw_info = intel_iommu_hw_info,
4462 .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4463 .domain_alloc_sva = intel_svm_domain_alloc,
4464 .domain_alloc_nested = intel_iommu_domain_alloc_nested,
4465 .probe_device = intel_iommu_probe_device,
4466 .release_device = intel_iommu_release_device,
4467 .get_resv_regions = intel_iommu_get_resv_regions,
4468 .device_group = intel_iommu_device_group,
4469 .dev_enable_feat = intel_iommu_dev_enable_feat,
4470 .dev_disable_feat = intel_iommu_dev_disable_feat,
4471 .is_attach_deferred = intel_iommu_is_attach_deferred,
4472 .def_domain_type = device_def_domain_type,
4473 .pgsize_bitmap = SZ_4K,
4474 .page_response = intel_iommu_page_response,
4475 .default_domain_ops = &(const struct iommu_domain_ops) {
4476 .attach_dev = intel_iommu_attach_device,
4477 .set_dev_pasid = intel_iommu_set_dev_pasid,
4478 .map_pages = intel_iommu_map_pages,
4479 .unmap_pages = intel_iommu_unmap_pages,
4480 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4481 .flush_iotlb_all = intel_flush_iotlb_all,
4482 .iotlb_sync = intel_iommu_tlb_sync,
4483 .iova_to_phys = intel_iommu_iova_to_phys,
4484 .free = intel_iommu_domain_free,
4485 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4486 }
4487 };
4488
4489 static void quirk_iommu_igfx(struct pci_dev *dev)
4490 {
4491 if (risky_device(dev))
4492 return;
4493
4494 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4495 disable_igfx_iommu = 1;
4496 }
4497
4498 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4499 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4500 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4501 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4502 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4503 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4504 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4505 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4506
4507 /* Broadwell igfx malfunctions with dmar */
4508 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4509 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4510 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4516 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4518 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4519 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4520 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4521 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4522 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4523 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4524 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4525 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4526 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4527 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4528 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4529 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4530 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4531 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4532
4533 static void quirk_iommu_rwbf(struct pci_dev *dev)
4534 {
4535 if (risky_device(dev))
4536 return;
4537
4538 /*
4539 * Mobile 4 Series Chipset neglects to set RWBF capability,
4540 * but needs it. Same seems to hold for the desktop versions.
4541 */
4542 pci_info(dev, "Forcing write-buffer flush capability\n");
4543 rwbf_quirk = 1;
4544 }
4545
4546 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4547 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4548 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4549 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4550 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4551 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4552 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4553
4554 #define GGC 0x52
4555 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4556 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4557 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4558 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4559 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4560 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4561 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4562 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4563
4564 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4565 {
4566 unsigned short ggc;
4567
4568 if (risky_device(dev))
4569 return;
4570
4571 if (pci_read_config_word(dev, GGC, &ggc))
4572 return;
4573
4574 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4575 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4576 disable_igfx_iommu = 1;
4577 } else if (!disable_igfx_iommu) {
4578 /* we have to ensure the gfx device is idle before we flush */
4579 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4580 iommu_set_dma_strict();
4581 }
4582 }
4583 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4584 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4585 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4586 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4587
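/*
 * On the integrated graphics devices of the listed IGD generations, set
 * iommu_skip_te_disable so translation is not disabled for the graphics
 * DMAR unit.
 */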
4588 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4589 {
4590 unsigned short ver;
4591
4592 if (!IS_GFX_DEVICE(dev))
4593 return;
4594
4595 ver = (dev->device >> 8) & 0xff;
4596 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4597 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4598 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4599 return;
4600
4601 if (risky_device(dev))
4602 return;
4603
4604 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4605 iommu_skip_te_disable = 1;
4606 }
4607 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4608
4609 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4610 ISOCH DMAR unit for the Azalia sound device, but not give it any
4611 TLB entries, which causes it to deadlock. Check for that. We do
4612 this in a function called from init_dmars(), instead of in a PCI
4613 quirk, because we don't want to print the obnoxious "BIOS broken"
4614 message if VT-d is actually disabled.
4615 */
4616 static void __init check_tylersburg_isoch(void)
4617 {
4618 struct pci_dev *pdev;
4619 uint32_t vtisochctrl;
4620
4621 /* If there's no Azalia in the system anyway, forget it. */
4622 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4623 if (!pdev)
4624 return;
4625
4626 if (risky_device(pdev)) {
4627 pci_dev_put(pdev);
4628 return;
4629 }
4630
4631 pci_dev_put(pdev);
4632
4633 /* System Management Registers. Might be hidden, in which case
4634 we can't do the sanity check. But that's OK, because the
4635 known-broken BIOSes _don't_ actually hide it, so far. */
4636 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4637 if (!pdev)
4638 return;
4639
4640 if (risky_device(pdev)) {
4641 pci_dev_put(pdev);
4642 return;
4643 }
4644
4645 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4646 pci_dev_put(pdev);
4647 return;
4648 }
4649
4650 pci_dev_put(pdev);
4651
4652 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4653 if (vtisochctrl & 1)
4654 return;
4655
4656 /* Drop all bits other than the number of TLB entries */
4657 vtisochctrl &= 0x1c;
4658
4659 /* If we have the recommended number of TLB entries (16), fine. */
4660 if (vtisochctrl == 0x10)
4661 return;
4662
4663 /* Zero TLB entries? You get to ride the short bus to school. */
4664 if (!vtisochctrl) {
4665 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4666 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4667 dmi_get_system_info(DMI_BIOS_VENDOR),
4668 dmi_get_system_info(DMI_BIOS_VERSION),
4669 dmi_get_system_info(DMI_PRODUCT_VERSION));
4670 iommu_identity_mapping |= IDENTMAP_AZALIA;
4671 return;
4672 }
4673
4674 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4675 vtisochctrl);
4676 }
4677
4678 /*
4679 * Here we deal with a device TLB defect where the device may inadvertently issue
4680 * an ATS invalidation completion before posted writes initiated with a translated
4681 * address that used translations matching the invalidation address range, thereby
4682 * violating the invalidation completion ordering.
4683 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4684 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4685 * under the control of the trusted/privileged host device driver must use this
4686 * quirk.
4687 * Device TLBs are invalidated under the following six conditions:
4688 * 1. Device driver does DMA API unmap IOVA
4689 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4690 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4691 * exit_mmap() due to crash
4692 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4693 * VM has to free pages that were unmapped
4694 * 5. Userspace driver unmaps a DMA buffer
4695 * 6. Cache invalidation in vSVA usage (upcoming)
4696 *
4697 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4698 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
4699 * invalidate the TLB the same way as a normal user unmap, which will use this
4700 * quirk. The dTLB invalidation after a PASID cache flush does not need this quirk.
4701 *
4702 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4703 */
4704 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4705 unsigned long address, unsigned long mask,
4706 u32 pasid, u16 qdep)
4707 {
4708 u16 sid;
4709
4710 if (likely(!info->dtlb_extra_inval))
4711 return;
4712
4713 sid = PCI_DEVID(info->bus, info->devfn);
4714 if (pasid == IOMMU_NO_PASID) {
4715 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4716 qdep, address, mask);
4717 } else {
4718 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4719 pasid, qdep, address, mask);
4720 }
4721 }
4722
4723 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
4724
4725 /*
4726 * Function to submit a command to the enhanced command interface. The
4727 * valid enhanced command descriptions are defined in Table 47 of the
4728 * VT-d spec. The VT-d hardware implementation may support some but not
4729 * all commands, which can be determined by checking the Enhanced
4730 * Command Capability Register.
4731 *
4732 * Return values:
4733 * - 0: Command successful without any error;
4734 * - Negative: software error value;
4735 * - Nonzero positive: failure status code defined in Table 48.
4736 */
4737 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4738 {
4739 unsigned long flags;
4740 u64 res;
4741 int ret;
4742
4743 if (!cap_ecmds(iommu->cap))
4744 return -ENODEV;
4745
4746 raw_spin_lock_irqsave(&iommu->register_lock, flags);
4747
4748 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4749 if (res & DMA_ECMD_ECRSP_IP) {
4750 ret = -EBUSY;
4751 goto err;
4752 }
4753
4754 /*
4755 * Unconditionally write operand B, because:
4756 * - There is no side effect if an ecmd doesn't require operand B,
4757 * but we still set the register to some value.
4758 * - This is not invoked on any critical path, so the extra MMIO
4759 * write raises no performance concern.
4760 */
4761 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4762 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4763
4764 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4765 !(res & DMA_ECMD_ECRSP_IP), res);
4766
4767 if (res & DMA_ECMD_ECRSP_IP) {
4768 ret = -ETIMEDOUT;
4769 goto err;
4770 }
4771
4772 ret = ecmd_get_status_code(res);
4773 err:
4774 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4775
4776 return ret;
4777 }
4778