1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright(c) 2023 Intel Corporation.
4 *
5 * Intel Trusted Domain Extensions (TDX) support
6 */
7
8 #define pr_fmt(fmt) "virt/tdx: " fmt
9
10 #include <linux/types.h>
11 #include <linux/cache.h>
12 #include <linux/init.h>
13 #include <linux/errno.h>
14 #include <linux/printk.h>
15 #include <linux/cpu.h>
16 #include <linux/spinlock.h>
17 #include <linux/percpu-defs.h>
18 #include <linux/mutex.h>
19 #include <linux/list.h>
20 #include <linux/memblock.h>
21 #include <linux/memory.h>
22 #include <linux/minmax.h>
23 #include <linux/sizes.h>
24 #include <linux/pfn.h>
25 #include <linux/align.h>
26 #include <linux/sort.h>
27 #include <linux/log2.h>
28 #include <linux/acpi.h>
29 #include <linux/suspend.h>
30 #include <asm/page.h>
31 #include <asm/special_insns.h>
32 #include <asm/msr-index.h>
33 #include <asm/msr.h>
34 #include <asm/cpufeature.h>
35 #include <asm/tdx.h>
36 #include <asm/cpu_device_id.h>
37 #include <asm/processor.h>
38 #include <asm/mce.h>
39 #include "tdx.h"
40
41 static u32 tdx_global_keyid __ro_after_init;
42 static u32 tdx_guest_keyid_start __ro_after_init;
43 static u32 tdx_nr_guest_keyids __ro_after_init;
44
45 static DEFINE_PER_CPU(bool, tdx_lp_initialized);
46
47 static struct tdmr_info_list tdx_tdmr_list;
48
49 static enum tdx_module_status_t tdx_module_status;
50 static DEFINE_MUTEX(tdx_module_lock);
51
52 /* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
53 static LIST_HEAD(tdx_memlist);
54
55 typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
56
57 static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
58 {
59 pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
60 }
61
62 static inline void seamcall_err_ret(u64 fn, u64 err,
63 struct tdx_module_args *args)
64 {
65 seamcall_err(fn, err, args);
66 pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
67 args->rcx, args->rdx, args->r8);
68 pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
69 args->r9, args->r10, args->r11);
70 }
71
72 static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
73 u64 fn, struct tdx_module_args *args)
74 {
75 u64 sret = sc_retry(func, fn, args);
76
77 if (sret == TDX_SUCCESS)
78 return 0;
79
80 if (sret == TDX_SEAMCALL_VMFAILINVALID)
81 return -ENODEV;
82
83 if (sret == TDX_SEAMCALL_GP)
84 return -EOPNOTSUPP;
85
86 if (sret == TDX_SEAMCALL_UD)
87 return -EACCES;
88
89 err_func(fn, sret, args);
90 return -EIO;
91 }
92
93 #define seamcall_prerr(__fn, __args) \
94 sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
95
96 #define seamcall_prerr_ret(__fn, __args) \
97 sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
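/*
 * Illustrative usage only (mirrors the real call sites below): fill a
 * struct tdx_module_args with the SEAMCALL inputs, then e.g.
 *
 *	ret = seamcall_prerr(TDH_SYS_INIT, &args);
 *
 * and treat any non-zero return value as a regular kernel errno.
 */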
98
99 /*
100 * Do the module global initialization once and return its result.
101 * It can be done on any cpu. It's always called with interrupts
102 * disabled.
103 */
104 static int try_init_module_global(void)
105 {
106 struct tdx_module_args args = {};
107 static DEFINE_RAW_SPINLOCK(sysinit_lock);
108 static bool sysinit_done;
109 static int sysinit_ret;
110
111 lockdep_assert_irqs_disabled();
112
113 raw_spin_lock(&sysinit_lock);
114
115 if (sysinit_done)
116 goto out;
117
118 /* RCX is module attributes and all bits are reserved */
119 args.rcx = 0;
120 sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
121
122 /*
123 * The first SEAMCALL also detects the TDX module, thus
124 * it can fail when the TDX module is not loaded.
125 * Print a message to let the user know.
126 */
127 if (sysinit_ret == -ENODEV)
128 pr_err("module not loaded\n");
129
130 sysinit_done = true;
131 out:
132 raw_spin_unlock(&sysinit_lock);
133 return sysinit_ret;
134 }
135
136 /**
137 * tdx_cpu_enable - Enable TDX on local cpu
138 *
139 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
140 * global initialization SEAMCALL if not done) on local cpu to make this
141 * cpu ready to run any other SEAMCALLs.
142 *
143 * Always call this function via IPI function calls.
144 *
145 * Return 0 on success, otherwise errors.
146 */
147 int tdx_cpu_enable(void)
148 {
149 struct tdx_module_args args = {};
150 int ret;
151
152 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
153 return -ENODEV;
154
155 lockdep_assert_irqs_disabled();
156
157 if (__this_cpu_read(tdx_lp_initialized))
158 return 0;
159
160 /*
161 * The TDX module global initialization is the very first step
162 * to enable TDX. Need to do it first (if it hasn't been done)
163 * before the per-cpu initialization.
164 */
165 ret = try_init_module_global();
166 if (ret)
167 return ret;
168
169 ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
170 if (ret)
171 return ret;
172
173 __this_cpu_write(tdx_lp_initialized, true);
174
175 return 0;
176 }
177 EXPORT_SYMBOL_GPL(tdx_cpu_enable);
178
179 /*
180 * Add a memory region as a TDX memory block. The caller must make sure
181 * all memory regions are added in address ascending order and don't
182 * overlap.
183 */
184 static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
185 unsigned long end_pfn, int nid)
186 {
187 struct tdx_memblock *tmb;
188
189 tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
190 if (!tmb)
191 return -ENOMEM;
192
193 INIT_LIST_HEAD(&tmb->list);
194 tmb->start_pfn = start_pfn;
195 tmb->end_pfn = end_pfn;
196 tmb->nid = nid;
197
198 /* @tmb_list is protected by mem_hotplug_lock */
199 list_add_tail(&tmb->list, tmb_list);
200 return 0;
201 }
202
203 static void free_tdx_memlist(struct list_head *tmb_list)
204 {
205 /* @tmb_list is protected by mem_hotplug_lock */
206 while (!list_empty(tmb_list)) {
207 struct tdx_memblock *tmb = list_first_entry(tmb_list,
208 struct tdx_memblock, list);
209
210 list_del(&tmb->list);
211 kfree(tmb);
212 }
213 }
214
215 /*
216 * Ensure that all memblock memory regions are convertible to TDX
217 * memory. Once this has been established, stash the memblock
218 * ranges off in a secondary structure because memblock is modified
219 * in memory hotplug while TDX memory regions are fixed.
220 */
221 static int build_tdx_memlist(struct list_head *tmb_list)
222 {
223 unsigned long start_pfn, end_pfn;
224 int i, nid, ret;
225
226 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
227 /*
228 * The first 1MB is not reported as TDX convertible memory.
229 * Although the first 1MB is always reserved and won't end up
230 * in the page allocator, it is still in memblock's memory
231 * regions. Skip it manually to exclude it from TDX memory.
232 */
233 start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
234 if (start_pfn >= end_pfn)
235 continue;
236
237 /*
238 * Add the memory regions as TDX memory. memblock already
239 * guarantees the regions are in address ascending order
240 * and don't overlap.
241 */
242 ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
243 if (ret)
244 goto err;
245 }
246
247 return 0;
248 err:
249 free_tdx_memlist(tmb_list);
250 return ret;
251 }
252
253 static int read_sys_metadata_field(u64 field_id, u64 *data)
254 {
255 struct tdx_module_args args = {};
256 int ret;
257
258 /*
259 * TDH.SYS.RD -- reads one global metadata field
260 * - RDX (in): the field to read
261 * - R8 (out): the field data
262 */
263 args.rdx = field_id;
264 ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
265 if (ret)
266 return ret;
267
268 *data = args.r8;
269
270 return 0;
271 }
272
273 #include "tdx_global_metadata.c"
274
275 static int check_features(struct tdx_sys_info *sysinfo)
276 {
277 u64 tdx_features0 = sysinfo->features.tdx_features0;
278
279 if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
280 pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
281 return -EINVAL;
282 }
283
284 return 0;
285 }
286
287 /* Calculate the actual TDMR size */
288 static int tdmr_size_single(u16 max_reserved_per_tdmr)
289 {
290 int tdmr_sz;
291
292 /*
293 * The actual size of TDMR depends on the maximum
294 * number of reserved areas.
295 */
296 tdmr_sz = sizeof(struct tdmr_info);
297 tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;
298
299 return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
300 }
301
302 static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
303 struct tdx_sys_info_tdmr *sysinfo_tdmr)
304 {
305 size_t tdmr_sz, tdmr_array_sz;
306 void *tdmr_array;
307
308 tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
309 tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;
310
311 /*
312 * To keep things simple, allocate all TDMRs together.
313 * The buffer needs to be physically contiguous to make
314 * sure each TDMR is physically contiguous.
315 */
316 tdmr_array = alloc_pages_exact(tdmr_array_sz,
317 GFP_KERNEL | __GFP_ZERO);
318 if (!tdmr_array)
319 return -ENOMEM;
320
321 tdmr_list->tdmrs = tdmr_array;
322
323 /*
324 * Keep the size of one TDMR so that the target TDMR at a
325 * given index in the TDMR list can be located.
326 */
327 tdmr_list->tdmr_sz = tdmr_sz;
328 tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
329 tdmr_list->nr_consumed_tdmrs = 0;
330
331 return 0;
332 }
333
334 static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
335 {
336 free_pages_exact(tdmr_list->tdmrs,
337 tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
338 }
339
340 /* Get the TDMR from the list at the given index. */
341 static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
342 int idx)
343 {
344 int tdmr_info_offset = tdmr_list->tdmr_sz * idx;
345
346 return (void *)tdmr_list->tdmrs + tdmr_info_offset;
347 }
348
349 #define TDMR_ALIGNMENT SZ_1G
350 #define TDMR_ALIGN_DOWN(_addr) ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
351 #define TDMR_ALIGN_UP(_addr) ALIGN((_addr), TDMR_ALIGNMENT)
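/*
 * For example (illustrative addresses): a memory region spanning
 * [0x7fe00000, 0x180200000) is expanded by TDMR_ALIGN_DOWN()/TDMR_ALIGN_UP()
 * to a TDMR covering [0x40000000, 0x1c0000000).
 */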
352
353 static inline u64 tdmr_end(struct tdmr_info *tdmr)
354 {
355 return tdmr->base + tdmr->size;
356 }
357
358 /*
359 * Take the memory referenced in @tmb_list and populate the
360 * preallocated @tdmr_list, following all the special alignment
361 * and size rules for TDMR.
362 */
363 static int fill_out_tdmrs(struct list_head *tmb_list,
364 struct tdmr_info_list *tdmr_list)
365 {
366 struct tdx_memblock *tmb;
367 int tdmr_idx = 0;
368
369 /*
370 * Loop over TDX memory regions and fill out TDMRs to cover them.
371 * To keep it simple, always try to use one TDMR to cover one
372 * memory region.
373 *
374 * In practice TDX supports at least 64 TDMRs. A 2-socket system
375 * typically consumes fewer than 10 of those. This code is
376 * dumb and simple and may use more TDMRs than is strictly
377 * required.
378 */
379 list_for_each_entry(tmb, tmb_list, list) {
380 struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
381 u64 start, end;
382
383 start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
384 end = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));
385
386 /*
387 * A valid size indicates the current TDMR has already
388 * been filled out to cover the previous memory region(s).
389 */
390 if (tdmr->size) {
391 /*
392 * Loop to the next if the current memory region
393 * has already been fully covered.
394 */
395 if (end <= tdmr_end(tdmr))
396 continue;
397
398 /* Otherwise, skip the already covered part. */
399 if (start < tdmr_end(tdmr))
400 start = tdmr_end(tdmr);
401
402 /*
403 * Create a new TDMR to cover the current memory
404 * region, or the remaining part of it.
405 */
406 tdmr_idx++;
407 if (tdmr_idx >= tdmr_list->max_tdmrs) {
408 pr_warn("initialization failed: TDMRs exhausted.\n");
409 return -ENOSPC;
410 }
411
412 tdmr = tdmr_entry(tdmr_list, tdmr_idx);
413 }
414
415 tdmr->base = start;
416 tdmr->size = end - start;
417 }
418
419 /* @tdmr_idx is always the index of the last valid TDMR. */
420 tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;
421
422 /*
423 * Warn early that the kernel is about to run out of TDMRs.
424 *
425 * This is an indication that the TDMR allocation has to be
426 * reworked to be smarter to avoid running into this issue.
427 */
428 if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
429 pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
430 tdmr_list->nr_consumed_tdmrs,
431 tdmr_list->max_tdmrs);
432
433 return 0;
434 }
435
436 /*
437 * Calculate PAMT size given a TDMR and a page size. The returned
438 * PAMT size is always aligned up to 4K page boundary.
439 */
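/*
 * Worked example (illustrative; the actual entry sizes come from the
 * TDX module's global metadata): a 1GB TDMR holds 1GB/4KB = 262144
 * 4K pages.  With a 16-byte 4K PAMT entry that is 4MB of 4K PAMT,
 * plus the much smaller 2M and 1G PAMTs on top.
 */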
440 static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
441 u16 pamt_entry_size)
442 {
443 unsigned long pamt_sz, nr_pamt_entries;
444
445 switch (pgsz) {
446 case TDX_PS_4K:
447 nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
448 break;
449 case TDX_PS_2M:
450 nr_pamt_entries = tdmr->size >> PMD_SHIFT;
451 break;
452 case TDX_PS_1G:
453 nr_pamt_entries = tdmr->size >> PUD_SHIFT;
454 break;
455 default:
456 WARN_ON_ONCE(1);
457 return 0;
458 }
459
460 pamt_sz = nr_pamt_entries * pamt_entry_size;
461 /* TDX requires the PAMT size to be 4K aligned */
462 pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
463
464 return pamt_sz;
465 }
466
467 /*
468 * Locate a NUMA node which should hold the allocation of the @tdmr
469 * PAMT. This node will have some memory covered by the TDMR. The
470 * relative amount of memory covered is not considered.
471 */
472 static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
473 {
474 struct tdx_memblock *tmb;
475
476 /*
477 * A TDMR must cover at least part of one TMB. That TMB will end
478 * after the TDMR begins. But, that TMB may have started before
479 * the TDMR. Find the next 'tmb' that _ends_ after this TDMR
480 * begins. Ignore 'tmb' start addresses. They are irrelevant.
481 */
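/*
 * Example (illustrative): for TDMR [1GB, 2GB) and TMBs
 * [768MB, 1.25GB) on node 0 and [1.25GB, 3GB) on node 1, the first
 * TMB ending after 1GB is the node 0 one, so node 0 is returned.
 */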
482 list_for_each_entry(tmb, tmb_list, list) {
483 if (tmb->end_pfn > PHYS_PFN(tdmr->base))
484 return tmb->nid;
485 }
486
487 /*
488 * Fall back to allocating the TDMR's metadata from node 0 when
489 * no TDX memory block can be found. This should never happen
490 * since TDMRs originate from TDX memory blocks.
491 */
492 pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
493 tdmr->base, tdmr_end(tdmr));
494 return 0;
495 }
496
497 /*
498 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
499 * within @tdmr, and set up PAMTs for @tdmr.
500 */
501 static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
502 struct list_head *tmb_list,
503 u16 pamt_entry_size[])
504 {
505 unsigned long pamt_base[TDX_PS_NR];
506 unsigned long pamt_size[TDX_PS_NR];
507 unsigned long tdmr_pamt_base;
508 unsigned long tdmr_pamt_size;
509 struct page *pamt;
510 int pgsz, nid;
511
512 nid = tdmr_get_nid(tdmr, tmb_list);
513
514 /*
515 * Calculate the PAMT size for each TDX supported page size
516 * and the total PAMT size.
517 */
518 tdmr_pamt_size = 0;
519 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
520 pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
521 pamt_entry_size[pgsz]);
522 tdmr_pamt_size += pamt_size[pgsz];
523 }
524
525 /*
526 * Allocate one chunk of physically contiguous memory for all
527 * PAMTs. This helps minimize the PAMT's use of reserved areas
528 * in overlapped TDMRs.
529 */
530 pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
531 nid, &node_online_map);
532 if (!pamt)
533 return -ENOMEM;
534
535 /*
536 * Break the contiguous allocation back up into the
537 * individual PAMTs for each page size.
538 */
539 tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
540 for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
541 pamt_base[pgsz] = tdmr_pamt_base;
542 tdmr_pamt_base += pamt_size[pgsz];
543 }
544
545 tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
546 tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
547 tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
548 tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
549 tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
550 tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];
551
552 return 0;
553 }
554
555 static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
556 unsigned long *pamt_size)
557 {
558 unsigned long pamt_bs, pamt_sz;
559
560 /*
561 * The PAMT was allocated in one contiguous unit. The 4K PAMT
562 * should always point to the beginning of that allocation.
563 */
564 pamt_bs = tdmr->pamt_4k_base;
565 pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
566
567 WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));
568
569 *pamt_base = pamt_bs;
570 *pamt_size = pamt_sz;
571 }
572
573 static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
574 void (*pamt_func)(unsigned long base, unsigned long size))
575 {
576 unsigned long pamt_base, pamt_size;
577
578 tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);
579
580 /* Do nothing if PAMT hasn't been allocated for this TDMR */
581 if (!pamt_size)
582 return;
583
584 if (WARN_ON_ONCE(!pamt_base))
585 return;
586
587 pamt_func(pamt_base, pamt_size);
588 }
589
590 static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
591 {
592 free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
593 }
594
595 static void tdmr_free_pamt(struct tdmr_info *tdmr)
596 {
597 tdmr_do_pamt_func(tdmr, free_pamt);
598 }
599
600 static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
601 {
602 int i;
603
604 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
605 tdmr_free_pamt(tdmr_entry(tdmr_list, i));
606 }
607
608 /* Allocate and set up PAMTs for all TDMRs */
609 static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
610 struct list_head *tmb_list,
611 u16 pamt_entry_size[])
612 {
613 int i, ret = 0;
614
615 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
616 ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
617 pamt_entry_size);
618 if (ret)
619 goto err;
620 }
621
622 return 0;
623 err:
624 tdmrs_free_pamt_all(tdmr_list);
625 return ret;
626 }
627
628 /*
629 * Convert TDX private pages back to normal by using MOVDIR64B to
630 * clear these pages. Note this function doesn't flush the cache
631 * of these TDX private pages. The caller should make sure of that.
632 */
633 static void reset_tdx_pages(unsigned long base, unsigned long size)
634 {
635 const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
636 unsigned long phys, end;
637
638 end = base + size;
639 for (phys = base; phys < end; phys += 64)
640 movdir64b(__va(phys), zero_page);
641
642 /*
643 * MOVDIR64B uses WC protocol. Use memory barrier to
644 * make sure any later user of these pages sees the
645 * updated data.
646 */
647 mb();
648 }
649
650 static void tdmr_reset_pamt(struct tdmr_info *tdmr)
651 {
652 tdmr_do_pamt_func(tdmr, reset_tdx_pages);
653 }
654
655 static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
656 {
657 int i;
658
659 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
660 tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
661 }
662
663 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
664 {
665 unsigned long pamt_size = 0;
666 int i;
667
668 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
669 unsigned long base, size;
670
671 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
672 pamt_size += size;
673 }
674
675 return pamt_size / 1024;
676 }
677
678 static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
679 u64 size, u16 max_reserved_per_tdmr)
680 {
681 struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
682 int idx = *p_idx;
683
684 /* Reserved area must be 4K aligned in offset and size */
685 if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
686 return -EINVAL;
687
688 if (idx >= max_reserved_per_tdmr) {
689 pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
690 tdmr->base, tdmr_end(tdmr));
691 return -ENOSPC;
692 }
693
694 /*
695 * Consume one reserved area per call. Make no effort to
696 * optimize, for instance by merging contiguous reserved areas
697 * to reduce the number of areas which are consumed.
698 */
699 rsvd_areas[idx].offset = addr - tdmr->base;
700 rsvd_areas[idx].size = size;
701
702 *p_idx = idx + 1;
703
704 return 0;
705 }
706
707 /*
708 * Go through @tmb_list to find holes between memory areas. If any of
709 * those holes fall within @tdmr, set up a TDMR reserved area to cover
710 * the hole.
711 */
712 static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
713 struct tdmr_info *tdmr,
714 int *rsvd_idx,
715 u16 max_reserved_per_tdmr)
716 {
717 struct tdx_memblock *tmb;
718 u64 prev_end;
719 int ret;
720
721 /*
722 * Start looking for reserved blocks at the
723 * beginning of the TDMR.
724 */
725 prev_end = tdmr->base;
726 list_for_each_entry(tmb, tmb_list, list) {
727 u64 start, end;
728
729 start = PFN_PHYS(tmb->start_pfn);
730 end = PFN_PHYS(tmb->end_pfn);
731
732 /* Break if this region is after the TDMR */
733 if (start >= tdmr_end(tdmr))
734 break;
735
736 /* Exclude regions before this TDMR */
737 if (end < tdmr->base)
738 continue;
739
740 /*
741 * Skip over memory areas that
742 * have already been dealt with.
743 */
744 if (start <= prev_end) {
745 prev_end = end;
746 continue;
747 }
748
749 /* Add the hole before this region */
750 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
751 start - prev_end,
752 max_reserved_per_tdmr);
753 if (ret)
754 return ret;
755
756 prev_end = end;
757 }
758
759 /* Add the hole after the last region if it exists. */
760 if (prev_end < tdmr_end(tdmr)) {
761 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
762 tdmr_end(tdmr) - prev_end,
763 max_reserved_per_tdmr);
764 if (ret)
765 return ret;
766 }
767
768 return 0;
769 }
770
771 /*
772 * Go through @tdmr_list to find all PAMTs. If any of those PAMTs
773 * overlaps with @tdmr, set up a TDMR reserved area to cover the
774 * overlapping part.
775 */
776 static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
777 struct tdmr_info *tdmr,
778 int *rsvd_idx,
779 u16 max_reserved_per_tdmr)
780 {
781 int i, ret;
782
783 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
784 struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
785 unsigned long pamt_base, pamt_size, pamt_end;
786
787 tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
788 /* Each TDMR must already have PAMT allocated */
789 WARN_ON_ONCE(!pamt_size || !pamt_base);
790
791 pamt_end = pamt_base + pamt_size;
792 /* Skip PAMTs outside of the given TDMR */
793 if ((pamt_end <= tdmr->base) ||
794 (pamt_base >= tdmr_end(tdmr)))
795 continue;
796
797 /* Only mark the part within the TDMR as reserved */
798 if (pamt_base < tdmr->base)
799 pamt_base = tdmr->base;
800 if (pamt_end > tdmr_end(tdmr))
801 pamt_end = tdmr_end(tdmr);
802
803 ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
804 pamt_end - pamt_base,
805 max_reserved_per_tdmr);
806 if (ret)
807 return ret;
808 }
809
810 return 0;
811 }
812
813 /* Compare function called by sort() for TDMR reserved areas */
814 static int rsvd_area_cmp_func(const void *a, const void *b)
815 {
816 struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
817 struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;
818
819 if (r1->offset + r1->size <= r2->offset)
820 return -1;
821 if (r1->offset >= r2->offset + r2->size)
822 return 1;
823
824 /* Reserved areas cannot overlap. The caller must guarantee that. */
825 WARN_ON_ONCE(1);
826 return -1;
827 }
828
829 /*
830 * Populate reserved areas for the given @tdmr, including memory holes
831 * (via @tmb_list) and PAMTs (via @tdmr_list).
832 */
833 static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
834 struct list_head *tmb_list,
835 struct tdmr_info_list *tdmr_list,
836 u16 max_reserved_per_tdmr)
837 {
838 int ret, rsvd_idx = 0;
839
840 ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
841 max_reserved_per_tdmr);
842 if (ret)
843 return ret;
844
845 ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
846 max_reserved_per_tdmr);
847 if (ret)
848 return ret;
849
850 /* TDX requires reserved areas listed in address ascending order */
851 sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
852 rsvd_area_cmp_func, NULL);
853
854 return 0;
855 }
856
857 /*
858 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
859 * holes (via @tmb_list) and PAMTs.
860 */
861 static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
862 struct list_head *tmb_list,
863 u16 max_reserved_per_tdmr)
864 {
865 int i;
866
867 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
868 int ret;
869
870 ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
871 tmb_list, tdmr_list, max_reserved_per_tdmr);
872 if (ret)
873 return ret;
874 }
875
876 return 0;
877 }
878
879 /*
880 * Construct a list of TDMRs on the preallocated space in @tdmr_list
881 * to cover all TDX memory regions in @tmb_list based on the TDX module
882 * TDMR global information in @sysinfo_tdmr.
883 */
884 static int construct_tdmrs(struct list_head *tmb_list,
885 struct tdmr_info_list *tdmr_list,
886 struct tdx_sys_info_tdmr *sysinfo_tdmr)
887 {
888 u16 pamt_entry_size[TDX_PS_NR] = {
889 sysinfo_tdmr->pamt_4k_entry_size,
890 sysinfo_tdmr->pamt_2m_entry_size,
891 sysinfo_tdmr->pamt_1g_entry_size,
892 };
893 int ret;
894
895 ret = fill_out_tdmrs(tmb_list, tdmr_list);
896 if (ret)
897 return ret;
898
899 ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
900 if (ret)
901 return ret;
902
903 ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
904 sysinfo_tdmr->max_reserved_per_tdmr);
905 if (ret)
906 tdmrs_free_pamt_all(tdmr_list);
907
908 /*
909 * The tdmr_info_list is read-only from here on out.
910 * Ensure that these writes are seen by other CPUs.
911 * Pairs with a smp_rmb() in is_pamt_page().
912 */
913 smp_wmb();
914
915 return ret;
916 }
917
918 static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
919 {
920 struct tdx_module_args args = {};
921 u64 *tdmr_pa_array;
922 size_t array_sz;
923 int i, ret;
924
925 /*
926 * TDMRs are passed to the TDX module via an array of physical
927 * addresses of each TDMR. The array itself also has certain
928 * alignment requirement.
929 */
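/*
 * For example (illustrative): 6 consumed TDMRs need a 48-byte array,
 * which is rounded up to 64 bytes below and then raised further if
 * TDMR_INFO_PA_ARRAY_ALIGNMENT is larger than that.
 */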
930 array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
931 array_sz = roundup_pow_of_two(array_sz);
932 if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
933 array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;
934
935 tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
936 if (!tdmr_pa_array)
937 return -ENOMEM;
938
939 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
940 tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));
941
942 args.rcx = __pa(tdmr_pa_array);
943 args.rdx = tdmr_list->nr_consumed_tdmrs;
944 args.r8 = global_keyid;
945 ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
946
947 /* Free the array as it is not required anymore. */
948 kfree(tdmr_pa_array);
949
950 return ret;
951 }
952
953 static int do_global_key_config(void *unused)
954 {
955 struct tdx_module_args args = {};
956
957 return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
958 }
959
960 /*
961 * Attempt to configure the global KeyID on all physical packages.
962 *
963 * This requires running code on at least one CPU in each package.
964 * TDMR initialization will fail if any package in the
965 * system has no online CPUs.
966 *
967 * This code takes no affirmative steps to online CPUs. Callers (aka.
968 * KVM) can ensure success by ensuring sufficient CPUs are online and
969 * can run SEAMCALLs.
970 */
971 static int config_global_keyid(void)
972 {
973 cpumask_var_t packages;
974 int cpu, ret = -EINVAL;
975
976 if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
977 return -ENOMEM;
978
979 /*
980 * Hardware doesn't guarantee cache coherency across different
981 * KeyIDs. The kernel needs to flush PAMT's dirty cachelines
982 * (associated with KeyID 0) before the TDX module can use the
983 * global KeyID to access the PAMT. Given PAMTs are potentially
984 * large (~1/256th of system RAM), just use WBINVD.
985 */
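/* E.g., on a host with 1TB of RAM that is on the order of 4GB of PAMT. */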
986 wbinvd_on_all_cpus();
987
988 for_each_online_cpu(cpu) {
989 /*
990 * The key configuration only needs to be done once per
991 * package and will return an error if configured more
992 * than once. Avoid doing it multiple times per package.
993 */
994 if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
995 packages))
996 continue;
997
998 /*
999 * TDH.SYS.KEY.CONFIG cannot run concurrently on
1000 * different cpus. Do it one by one.
1001 */
1002 ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
1003 if (ret)
1004 break;
1005 }
1006
1007 free_cpumask_var(packages);
1008 return ret;
1009 }
1010
1011 static int init_tdmr(struct tdmr_info *tdmr)
1012 {
1013 u64 next;
1014
1015 /*
1016 * Initializing a TDMR can be time consuming. To avoid long
1017 * SEAMCALLs, the TDX module may only initialize a part of the
1018 * TDMR in each call.
1019 */
1020 do {
1021 struct tdx_module_args args = {
1022 .rcx = tdmr->base,
1023 };
1024 int ret;
1025
1026 ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
1027 if (ret)
1028 return ret;
1029 /*
1030 * RDX contains 'next-to-initialize' address if
1031 * TDH.SYS.TDMR.INIT did not fully complete and
1032 * should be retried.
1033 */
1034 next = args.rdx;
1035 cond_resched();
1036 /* Keep making SEAMCALLs until the TDMR is done */
1037 } while (next < tdmr->base + tdmr->size);
1038
1039 return 0;
1040 }
1041
1042 static int init_tdmrs(struct tdmr_info_list *tdmr_list)
1043 {
1044 int i;
1045
1046 /*
1047 * This operation is costly. It can be parallelized,
1048 * but keep it simple for now.
1049 */
1050 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1051 int ret;
1052
1053 ret = init_tdmr(tdmr_entry(tdmr_list, i));
1054 if (ret)
1055 return ret;
1056 }
1057
1058 return 0;
1059 }
1060
1061 static int init_tdx_module(void)
1062 {
1063 struct tdx_sys_info sysinfo;
1064 int ret;
1065
1066 ret = get_tdx_sys_info(&sysinfo);
1067 if (ret)
1068 return ret;
1069
1070 /* Check whether the kernel can support this module */
1071 ret = check_features(&sysinfo);
1072 if (ret)
1073 return ret;
1074
1075 /*
1076 * To keep things simple, assume that all TDX-protected memory
1077 * will come from the page allocator. Make sure all pages in the
1078 * page allocator are TDX-usable memory.
1079 *
1080 * Build the list of "TDX-usable" memory regions which cover all
1081 * pages in the page allocator to guarantee that. Do it while
1082 * holding mem_hotplug_lock read-lock as the memory hotplug code
1083 * path reads the @tdx_memlist to reject any new memory.
1084 */
1085 get_online_mems();
1086
1087 ret = build_tdx_memlist(&tdx_memlist);
1088 if (ret)
1089 goto out_put_tdxmem;
1090
1091 /* Allocate enough space for constructing TDMRs */
1092 ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr);
1093 if (ret)
1094 goto err_free_tdxmem;
1095
1096 /* Cover all TDX-usable memory regions in TDMRs */
1097 ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr);
1098 if (ret)
1099 goto err_free_tdmrs;
1100
1101 /* Pass the TDMRs and the global KeyID to the TDX module */
1102 ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
1103 if (ret)
1104 goto err_free_pamts;
1105
1106 /* Config the key of global KeyID on all packages */
1107 ret = config_global_keyid();
1108 if (ret)
1109 goto err_reset_pamts;
1110
1111 /* Initialize TDMRs to complete the TDX module initialization */
1112 ret = init_tdmrs(&tdx_tdmr_list);
1113 if (ret)
1114 goto err_reset_pamts;
1115
1116 pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
1117
1118 out_put_tdxmem:
1119 /*
1120 * @tdx_memlist is written here and read at memory hotplug time.
1121 * Lock out memory hotplug code while building it.
1122 */
1123 put_online_mems();
1124 return ret;
1125
1126 err_reset_pamts:
1127 /*
1128 * Part of PAMTs may already have been initialized by the
1129 * TDX module. Flush cache before returning PAMTs back
1130 * to the kernel.
1131 */
1132 wbinvd_on_all_cpus();
1133 /*
1134 * According to the TDX hardware spec, if the platform
1135 * doesn't have the "partial write machine check"
1136 * erratum, any kernel read/write will never cause #MC
1137 * in kernel space, thus it's OK to not convert PAMTs
1138 * back to normal. But do the conversion anyway here
1139 * as suggested by the TDX spec.
1140 */
1141 tdmrs_reset_pamt_all(&tdx_tdmr_list);
1142 err_free_pamts:
1143 tdmrs_free_pamt_all(&tdx_tdmr_list);
1144 err_free_tdmrs:
1145 free_tdmr_list(&tdx_tdmr_list);
1146 err_free_tdxmem:
1147 free_tdx_memlist(&tdx_memlist);
1148 goto out_put_tdxmem;
1149 }
1150
1151 static int __tdx_enable(void)
1152 {
1153 int ret;
1154
1155 ret = init_tdx_module();
1156 if (ret) {
1157 pr_err("module initialization failed (%d)\n", ret);
1158 tdx_module_status = TDX_MODULE_ERROR;
1159 return ret;
1160 }
1161
1162 pr_info("module initialized\n");
1163 tdx_module_status = TDX_MODULE_INITIALIZED;
1164
1165 return 0;
1166 }
1167
1168 /**
1169 * tdx_enable - Enable TDX module to make it ready to run TDX guests
1170 *
1171 * This function assumes the caller has: 1) held read lock of CPU hotplug
1172 * lock to prevent any new cpu from becoming online; 2) done both VMXON
1173 * and tdx_cpu_enable() on all online cpus.
1174 *
1175 * This function requires there's at least one online cpu for each CPU
1176 * package to succeed.
1177 *
1178 * This function can be called in parallel by multiple callers.
1179 *
1180 * Return 0 if TDX is enabled successfully, otherwise error.
1181 */
1182 int tdx_enable(void)
1183 {
1184 int ret;
1185
1186 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1187 return -ENODEV;
1188
1189 lockdep_assert_cpus_held();
1190
1191 mutex_lock(&tdx_module_lock);
1192
1193 switch (tdx_module_status) {
1194 case TDX_MODULE_UNINITIALIZED:
1195 ret = __tdx_enable();
1196 break;
1197 case TDX_MODULE_INITIALIZED:
1198 /* Already initialized, great, tell the caller. */
1199 ret = 0;
1200 break;
1201 default:
1202 /* Failed to initialize in the previous attempts */
1203 ret = -EINVAL;
1204 break;
1205 }
1206
1207 mutex_unlock(&tdx_module_lock);
1208
1209 return ret;
1210 }
1211 EXPORT_SYMBOL_GPL(tdx_enable);
1212
1213 static bool is_pamt_page(unsigned long phys)
1214 {
1215 struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
1216 int i;
1217
1218 /* Ensure that all remote 'tdmr_list' writes are visible: */
1219 smp_rmb();
1220
1221 /*
1222 * The TDX module is no longer returning TDX_SYS_NOT_READY and
1223 * is initialized. The 'tdmr_list' was initialized long ago
1224 * and is now read-only.
1225 */
1226 for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
1227 unsigned long base, size;
1228
1229 tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
1230
1231 if (phys >= base && phys < (base + size))
1232 return true;
1233 }
1234
1235 return false;
1236 }
1237
1238 /*
1239 * Return whether the memory page at the given physical address is TDX
1240 * private memory or not.
1241 *
1242 * This can be imprecise for two known reasons:
1243 * 1. PAMTs are private memory and exist before the TDX module is
1244 * ready and TDH_PHYMEM_PAGE_RDMD works. This is a relatively
1245 * short window that occurs once per boot.
1246 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
1247 * page. However, the page can still cause #MC until it has been
1248 * fully converted to shared using 64-byte writes like MOVDIR64B.
1249 * Buggy hosts might still leave #MC-causing memory in place which
1250 * this function can not detect.
1251 */
1252 static bool paddr_is_tdx_private(unsigned long phys)
1253 {
1254 struct tdx_module_args args = {
1255 .rcx = phys & PAGE_MASK,
1256 };
1257 u64 sret;
1258
1259 if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
1260 return false;
1261
1262 /* Get page type from the TDX module */
1263 sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
1264
1265 /*
1266 * The SEAMCALL will not return success unless there is a
1267 * working, "ready" TDX module. Assume an absence of TDX
1268 * private pages until SEAMCALL is working.
1269 */
1270 if (sret)
1271 return false;
1272
1273 /*
1274 * SEAMCALL was successful -- read page type (via RCX):
1275 *
1276 * - PT_NDA: Page is not used by the TDX module
1277 * - PT_RSVD: Reserved for Non-TDX use
1278 * - Others: Page is used by the TDX module
1279 *
1280 * Note PAMT pages are marked as PT_RSVD but they are also TDX
1281 * private memory.
1282 */
1283 switch (args.rcx) {
1284 case PT_NDA:
1285 return false;
1286 case PT_RSVD:
1287 return is_pamt_page(phys);
1288 default:
1289 return true;
1290 }
1291 }
1292
1293 /*
1294 * Some TDX-capable CPUs have an erratum. A write to TDX private
1295 * memory poisons that memory, and a subsequent read of that memory
1296 * triggers #MC.
1297 *
1298 * Help distinguish erratum-triggered #MCs from a normal hardware one.
1299 * Just print an additional message to show that such an #MC may be
1300 * the result of the erratum.
1301 */
1302 const char *tdx_dump_mce_info(struct mce *m)
1303 {
1304 if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
1305 return NULL;
1306
1307 if (!paddr_is_tdx_private(m->addr))
1308 return NULL;
1309
1310 return "TDX private memory error. Possible kernel bug.";
1311 }
1312
1313 static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
1314 u32 *nr_tdx_keyids)
1315 {
1316 u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
1317 int ret;
1318
1319 /*
1320 * IA32_MKTME_KEYID_PARTITIONING:
1321 * Bit [31:0]: Number of MKTME KeyIDs.
1322 * Bit [63:32]: Number of TDX private KeyIDs.
1323 */
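/*
 * Example (illustrative values): if the MSR reports 63 MKTME KeyIDs
 * and 64 TDX KeyIDs, then KeyID 0 is the TME KeyID, KeyIDs 1-63 are
 * MKTME KeyIDs and the TDX private KeyIDs are [64, 128).
 */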
1324 ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
1325 &_nr_tdx_keyids);
1326 if (ret || !_nr_tdx_keyids)
1327 return -EINVAL;
1328
1329 /* TDX KeyIDs start after the last MKTME KeyID. */
1330 _tdx_keyid_start = _nr_mktme_keyids + 1;
1331
1332 *tdx_keyid_start = _tdx_keyid_start;
1333 *nr_tdx_keyids = _nr_tdx_keyids;
1334
1335 return 0;
1336 }
1337
1338 static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
1339 {
1340 struct tdx_memblock *tmb;
1341
1342 /*
1343 * This check assumes that the start_pfn<->end_pfn range does not
1344 * cross multiple @tdx_memlist entries. A single memory online
1345 * event across multiple memblocks (from which @tdx_memlist
1346 * entries are derived at the time of module initialization) is
1347 * not possible. This is because memory offline/online is done
1348 * at the granularity of 'struct memory_block', and the hotpluggable
1349 * memory region (one memblock) must be a multiple of memory_block.
1350 */
1351 list_for_each_entry(tmb, &tdx_memlist, list) {
1352 if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
1353 return true;
1354 }
1355 return false;
1356 }
1357
1358 static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
1359 void *v)
1360 {
1361 struct memory_notify *mn = v;
1362
1363 if (action != MEM_GOING_ONLINE)
1364 return NOTIFY_OK;
1365
1366 /*
1367 * Empty list means TDX isn't enabled. Allow any memory
1368 * to go online.
1369 */
1370 if (list_empty(&tdx_memlist))
1371 return NOTIFY_OK;
1372
1373 /*
1374 * The TDX memory configuration is static and cannot be
1375 * changed. Reject onlining any memory which is outside of
1376 * the static configuration whether it supports TDX or not.
1377 */
1378 if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
1379 return NOTIFY_OK;
1380
1381 return NOTIFY_BAD;
1382 }
1383
1384 static struct notifier_block tdx_memory_nb = {
1385 .notifier_call = tdx_memory_notifier,
1386 };
1387
1388 static void __init check_tdx_erratum(void)
1389 {
1390 /*
1391 * These CPUs have an erratum. A partial write from non-TD
1392 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
1393 * private memory poisons that memory, and a subsequent read of
1394 * that memory triggers #MC.
1395 */
1396 switch (boot_cpu_data.x86_vfm) {
1397 case INTEL_SAPPHIRERAPIDS_X:
1398 case INTEL_EMERALDRAPIDS_X:
1399 setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
1400 }
1401 }
1402
1403 void __init tdx_init(void)
1404 {
1405 u32 tdx_keyid_start, nr_tdx_keyids;
1406 int err;
1407
1408 err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
1409 if (err)
1410 return;
1411
1412 pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
1413 tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);
1414
1415 /*
1416 * The TDX module itself requires one 'global KeyID' to protect
1417 * its metadata. If there's only one TDX KeyID, there won't be
1418 * any left for TDX guests, thus there's no point in enabling
1419 * TDX at all.
1420 */
1421 if (nr_tdx_keyids < 2) {
1422 pr_err("initialization failed: too few private KeyIDs available.\n");
1423 return;
1424 }
1425
1426 /*
1427 * At this point, hibernation_available() indicates whether or
1428 * not hibernation support has been permanently disabled.
1429 */
1430 if (hibernation_available()) {
1431 pr_err("initialization failed: Hibernation support is enabled\n");
1432 return;
1433 }
1434
1435 err = register_memory_notifier(&tdx_memory_nb);
1436 if (err) {
1437 pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
1438 err);
1439 return;
1440 }
1441
1442 #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
1443 pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
1444 acpi_suspend_lowlevel = NULL;
1445 #endif
1446
1447 /*
1448 * Just use the first TDX KeyID as the 'global KeyID' and
1449 * leave the rest for TDX guests.
1450 */
1451 tdx_global_keyid = tdx_keyid_start;
1452 tdx_guest_keyid_start = tdx_keyid_start + 1;
1453 tdx_nr_guest_keyids = nr_tdx_keyids - 1;
1454
1455 setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);
1456
1457 check_tdx_erratum();
1458 }
1459