// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2023 Intel Corporation.
 *
 * Intel Trusted Domain Extensions (TDX) support
 */

#define pr_fmt(fmt)	"virt/tdx: " fmt

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/cpu.h>
#include <linux/spinlock.h>
#include <linux/percpu-defs.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/pfn.h>
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"

static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;

static DEFINE_PER_CPU(bool, tdx_lp_initialized);

static struct tdmr_info_list tdx_tdmr_list;

static enum tdx_module_status_t tdx_module_status;
static DEFINE_MUTEX(tdx_module_lock);

/* All TDX-usable memory regions.  Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);

typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);

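/* Print the SEAMCALL leaf function and error code when a SEAMCALL fails */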
static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
{
	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
}

static inline void seamcall_err_ret(u64 fn, u64 err,
				    struct tdx_module_args *args)
{
	seamcall_err(fn, err, args);
	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
			args->rcx, args->rdx, args->r8);
	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
			args->r9, args->r10, args->r11);
}

static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
				 u64 fn, struct tdx_module_args *args)
{
	u64 sret = sc_retry(func, fn, args);

	if (sret == TDX_SUCCESS)
		return 0;

	if (sret == TDX_SEAMCALL_VMFAILINVALID)
		return -ENODEV;

	if (sret == TDX_SEAMCALL_GP)
		return -EOPNOTSUPP;

	if (sret == TDX_SEAMCALL_UD)
		return -EACCES;

	err_func(fn, sret, args);
	return -EIO;
}

#define seamcall_prerr(__fn, __args)						\
	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))

#define seamcall_prerr_ret(__fn, __args)					\
	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))

/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
 */
static int try_init_module_global(void)
{
	struct tdx_module_args args = {};
	static DEFINE_RAW_SPINLOCK(sysinit_lock);
	static bool sysinit_done;
	static int sysinit_ret;

	lockdep_assert_irqs_disabled();

	raw_spin_lock(&sysinit_lock);

	if (sysinit_done)
		goto out;

	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail because the TDX module is not loaded.
	 * Print a message to let the user know.
	 */
	if (sysinit_ret == -ENODEV)
		pr_err("module not loaded\n");

	sysinit_done = true;
out:
	raw_spin_unlock(&sysinit_lock);
	return sysinit_ret;
}

/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do the one-time TDX module per-cpu initialization SEAMCALL (and the TDX
 * module global initialization SEAMCALL if it hasn't been done yet) on the
 * local cpu to make this cpu ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
	struct tdx_module_args args = {};
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_irqs_disabled();

	if (__this_cpu_read(tdx_lp_initialized))
		return 0;

	/*
	 * The TDX module global initialization is the very first step
	 * to enable TDX.  Do it first (if it hasn't been done yet)
	 * before the per-cpu initialization.
	 */
	ret = try_init_module_global();
	if (ret)
		return ret;

	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
	if (ret)
		return ret;

	__this_cpu_write(tdx_lp_initialized, true);

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
			    unsigned long end_pfn, int nid)
{
	struct tdx_memblock *tmb;

	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
	if (!tmb)
		return -ENOMEM;

	INIT_LIST_HEAD(&tmb->list);
	tmb->start_pfn = start_pfn;
	tmb->end_pfn = end_pfn;
	tmb->nid = nid;

	/* @tmb_list is protected by mem_hotplug_lock */
	list_add_tail(&tmb->list, tmb_list);
	return 0;
}

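/* Free all TDX memory blocks on @tmb_list */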
static void free_tdx_memlist(struct list_head *tmb_list)
{
	/* @tmb_list is protected by mem_hotplug_lock */
	while (!list_empty(tmb_list)) {
		struct tdx_memblock *tmb = list_first_entry(tmb_list,
				struct tdx_memblock, list);

		list_del(&tmb->list);
		kfree(tmb);
	}
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, ret;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/*
		 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * in the page allocator, it is still in memblock's memory
		 * regions.  Skip it manually to exclude it as TDX memory.
		 */
		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
		if (start_pfn >= end_pfn)
			continue;

		/*
		 * Add the memory regions as TDX memory.  memblock already
		 * guarantees the regions are in address ascending order
		 * and don't overlap.
		 */
		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
		if (ret)
			goto err;
	}

	return 0;
err:
	free_tdx_memlist(tmb_list);
	return ret;
}

static int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/*
	 * TDH.SYS.RD -- reads one global metadata field
	 *  - RDX (in): the field to read
	 *  - R8 (out): the field data
	 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

#include "tdx_global_metadata.c"

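/* Reject TDX modules that lack features the kernel depends on */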
static int check_features(struct tdx_sys_info *sysinfo)
{
	u64 tdx_features0 = sysinfo->features.tdx_features0;

	if (!(tdx_features0 & TDX_FEATURES0_NO_RBP_MOD)) {
		pr_err("frame pointer (RBP) clobber bug present, upgrade TDX module\n");
		return -EINVAL;
	}

	return 0;
}

/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
	int tdmr_sz;

	/*
	 * The actual size of TDMR depends on the maximum
	 * number of reserved areas.
	 */
	tdmr_sz = sizeof(struct tdmr_info);
	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}

static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
	size_t tdmr_sz, tdmr_array_sz;
	void *tdmr_array;

	tdmr_sz = tdmr_size_single(sysinfo_tdmr->max_reserved_per_tdmr);
	tdmr_array_sz = tdmr_sz * sysinfo_tdmr->max_tdmrs;

	/*
	 * To keep things simple, allocate all TDMRs together.
	 * The buffer needs to be physically contiguous to make
	 * sure each TDMR is physically contiguous.
	 */
	tdmr_array = alloc_pages_exact(tdmr_array_sz,
			GFP_KERNEL | __GFP_ZERO);
	if (!tdmr_array)
		return -ENOMEM;

	tdmr_list->tdmrs = tdmr_array;

	/*
	 * Keep the size of one TDMR so that the target TDMR
	 * at a given index in the TDMR list can be located.
	 */
	tdmr_list->tdmr_sz = tdmr_sz;
	tdmr_list->max_tdmrs = sysinfo_tdmr->max_tdmrs;
	tdmr_list->nr_consumed_tdmrs = 0;

	return 0;
}

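/* Free the physically contiguous buffer holding all TDMRs */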
static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
	free_pages_exact(tdmr_list->tdmrs,
			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}

/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
				    int idx)
{
	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

#define TDMR_ALIGNMENT		SZ_1G
#define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)

static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
	return tdmr->base + tdmr->size;
}

/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
			  struct tdmr_info_list *tdmr_list)
{
	struct tdx_memblock *tmb;
	int tdmr_idx = 0;

	/*
	 * Loop over TDX memory regions and fill out TDMRs to cover them.
	 * To keep it simple, always try to use one TDMR to cover one
	 * memory region.
	 *
	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
	 * typically consumes fewer than 10 of those.  This code is
	 * dumb and simple and may use more TDMRs than is strictly
	 * required.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		u64 start, end;

		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

		/*
		 * A valid size indicates the current TDMR has already
		 * been filled out to cover the previous memory region(s).
		 */
		if (tdmr->size) {
			/*
			 * Loop to the next if the current memory region
			 * has already been fully covered.
			 */
			if (end <= tdmr_end(tdmr))
				continue;

			/* Otherwise, skip the already covered part. */
			if (start < tdmr_end(tdmr))
				start = tdmr_end(tdmr);

			/*
			 * Create a new TDMR to cover the current memory
			 * region, or the remaining part of it.
			 */
			tdmr_idx++;
			if (tdmr_idx >= tdmr_list->max_tdmrs) {
				pr_warn("initialization failed: TDMRs exhausted.\n");
				return -ENOSPC;
			}

			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		}

		tdmr->base = start;
		tdmr->size = end - start;
	}

	/* @tdmr_idx is always the index of the last valid TDMR. */
	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

	/*
	 * Warn early that the kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter so it doesn't run into this limit.
	 */
	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
				tdmr_list->nr_consumed_tdmrs,
				tdmr_list->max_tdmrs);

	return 0;
}

/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
 */
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
				      u16 pamt_entry_size)
{
	unsigned long pamt_sz, nr_pamt_entries;

	switch (pgsz) {
	case TDX_PS_4K:
		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
		break;
	case TDX_PS_2M:
		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
		break;
	case TDX_PS_1G:
		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	pamt_sz = nr_pamt_entries * pamt_entry_size;
	/* TDX requires the PAMT size to be 4K aligned */
	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);

	return pamt_sz;
}

/*
 * Locate a NUMA node which should hold the allocation of the @tdmr
 * PAMT.  This node will have some memory covered by the TDMR.  The
 * relative amount of memory covered is not considered.
 */
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
{
	struct tdx_memblock *tmb;

	/*
	 * A TDMR must cover at least part of one TMB.  That TMB will end
	 * after the TDMR begins.  But, that TMB may have started before
	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
			return tmb->nid;
	}

	/*
	 * Fall back to allocating the TDMR's metadata from node 0 when
	 * no TDX memory block can be found.  This should never happen
	 * since TDMRs originate from TDX memory blocks.
	 */
	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
			tdmr->base, tdmr_end(tdmr));
	return 0;
}

/*
 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
 * within @tdmr, and set up PAMTs for @tdmr.
 */
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
			    struct list_head *tmb_list,
			    u16 pamt_entry_size[])
{
	unsigned long pamt_base[TDX_PS_NR];
	unsigned long pamt_size[TDX_PS_NR];
	unsigned long tdmr_pamt_base;
	unsigned long tdmr_pamt_size;
	struct page *pamt;
	int pgsz, nid;

	nid = tdmr_get_nid(tdmr, tmb_list);

	/*
	 * Calculate the PAMT size for each TDX supported page size
	 * and the total PAMT size.
	 */
	tdmr_pamt_size = 0;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
					pamt_entry_size[pgsz]);
		tdmr_pamt_size += pamt_size[pgsz];
	}

	/*
	 * Allocate one chunk of physically contiguous memory for all
	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
	 * in overlapping TDMRs.
	 */
	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
			nid, &node_online_map);
	if (!pamt)
		return -ENOMEM;

	/*
	 * Break the contiguous allocation back up into the
	 * individual PAMTs for each page size.
	 */
	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_base[pgsz] = tdmr_pamt_base;
		tdmr_pamt_base += pamt_size[pgsz];
	}

	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];

	return 0;
}

static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
			  unsigned long *pamt_size)
{
	unsigned long pamt_bs, pamt_sz;

	/*
	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
	 * should always point to the beginning of that allocation.
	 */
	pamt_bs = tdmr->pamt_4k_base;
	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;

	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));

	*pamt_base = pamt_bs;
	*pamt_size = pamt_sz;
}

static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
		void (*pamt_func)(unsigned long base, unsigned long size))
{
	unsigned long pamt_base, pamt_size;

	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

	/* Do nothing if PAMT hasn't been allocated for this TDMR */
	if (!pamt_size)
		return;

	if (WARN_ON_ONCE(!pamt_base))
		return;

	pamt_func(pamt_base, pamt_size);
}

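/* Return a contiguous PAMT allocation back to the page allocator */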
static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
{
	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
}

static void tdmr_free_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, free_pamt);
}

static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
}

/* Allocate and set up PAMTs for all TDMRs */
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
				 struct list_head *tmb_list,
				 u16 pamt_entry_size[])
{
	int i, ret = 0;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
				pamt_entry_size);
		if (ret)
			goto err;
	}

	return 0;
err:
	tdmrs_free_pamt_all(tdmr_list);
	return ret;
}

/*
 * Convert TDX private pages back to normal by using MOVDIR64B to
 * clear these pages.  Note this function doesn't flush the caches
 * for these TDX private pages.  The caller must take care of that.
 */
static void reset_tdx_pages(unsigned long base, unsigned long size)
{
	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
	unsigned long phys, end;

	end = base + size;
	for (phys = base; phys < end; phys += 64)
		movdir64b(__va(phys), zero_page);

	/*
	 * MOVDIR64B uses the WC protocol.  Use a memory barrier
	 * to make sure any later user of these pages sees the
	 * updated data.
	 */
	mb();
}

static void tdmr_reset_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
}

static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
}

static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
{
	unsigned long pamt_size = 0;
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
		pamt_size += size;
	}

	return pamt_size / 1024;
}

static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
			      u64 size, u16 max_reserved_per_tdmr)
{
	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
	int idx = *p_idx;

	/* Reserved area must be 4K aligned in offset and size */
	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
		return -EINVAL;

	if (idx >= max_reserved_per_tdmr) {
		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
				tdmr->base, tdmr_end(tdmr));
		return -ENOSPC;
	}

	/*
	 * Consume one reserved area per call.  Make no effort to
	 * optimize or reduce the number of reserved areas which are
	 * consumed by contiguous reserved areas, for instance.
	 */
	rsvd_areas[idx].offset = addr - tdmr->base;
	rsvd_areas[idx].size = size;

	*p_idx = idx + 1;

	return 0;
}

/*
 * Go through @tmb_list to find holes between memory areas.  If any of
 * those holes fall within @tdmr, set up a TDMR reserved area to cover
 * the hole.
 */
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	struct tdx_memblock *tmb;
	u64 prev_end;
	int ret;

	/*
	 * Start looking for reserved blocks at the
	 * beginning of the TDMR.
	 */
	prev_end = tdmr->base;
	list_for_each_entry(tmb, tmb_list, list) {
		u64 start, end;

		start = PFN_PHYS(tmb->start_pfn);
		end   = PFN_PHYS(tmb->end_pfn);

		/* Break if this region is after the TDMR */
		if (start >= tdmr_end(tdmr))
			break;

		/* Exclude regions before this TDMR */
		if (end < tdmr->base)
			continue;

		/*
		 * Skip over memory areas that
		 * have already been dealt with.
		 */
		if (start <= prev_end) {
			prev_end = end;
			continue;
		}

		/* Add the hole before this region */
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				start - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;

		prev_end = end;
	}

	/* Add the hole after the last region if it exists. */
	if (prev_end < tdmr_end(tdmr)) {
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				tdmr_end(tdmr) - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
 * overlaps with @tdmr, set up a TDMR reserved area to cover the
 * overlapping part.
 */
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	int i, ret;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
		unsigned long pamt_base, pamt_size, pamt_end;

		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
		/* Each TDMR must already have PAMT allocated */
		WARN_ON_ONCE(!pamt_size || !pamt_base);

		pamt_end = pamt_base + pamt_size;
		/* Skip PAMTs outside of the given TDMR */
		if ((pamt_end <= tdmr->base) ||
				(pamt_base >= tdmr_end(tdmr)))
			continue;

		/* Only mark the part within the TDMR as reserved */
		if (pamt_base < tdmr->base)
			pamt_base = tdmr->base;
		if (pamt_end > tdmr_end(tdmr))
			pamt_end = tdmr_end(tdmr);

		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
				pamt_end - pamt_base,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/* Compare function called by sort() for TDMR reserved areas */
static int rsvd_area_cmp_func(const void *a, const void *b)
{
	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;

	if (r1->offset + r1->size <= r2->offset)
		return -1;
	if (r1->offset >= r2->offset + r2->size)
		return 1;

	/* Reserved areas cannot overlap.  The caller must guarantee that. */
	WARN_ON_ONCE(1);
	return -1;
}

/*
 * Populate reserved areas for the given @tdmr, including memory holes
 * (via @tmb_list) and PAMTs (via @tdmr_list).
 */
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
				    struct list_head *tmb_list,
				    struct tdmr_info_list *tdmr_list,
				    u16 max_reserved_per_tdmr)
{
	int ret, rsvd_idx = 0;

	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	/* TDX requires reserved areas listed in address ascending order */
	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
			rsvd_area_cmp_func, NULL);

	return 0;
}

/*
 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
 * holes (via @tmb_list) and PAMTs.
 */
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
					 struct list_head *tmb_list,
					 u16 max_reserved_per_tdmr)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
				tmb_list, tdmr_list, max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Construct a list of TDMRs on the preallocated space in @tdmr_list
 * to cover all TDX memory regions in @tmb_list based on the TDX module
 * TDMR global information in @sysinfo_tdmr.
 */
static int construct_tdmrs(struct list_head *tmb_list,
			   struct tdmr_info_list *tdmr_list,
			   struct tdx_sys_info_tdmr *sysinfo_tdmr)
{
	u16 pamt_entry_size[TDX_PS_NR] = {
		sysinfo_tdmr->pamt_4k_entry_size,
		sysinfo_tdmr->pamt_2m_entry_size,
		sysinfo_tdmr->pamt_1g_entry_size,
	};
	int ret;

	ret = fill_out_tdmrs(tmb_list, tdmr_list);
	if (ret)
		return ret;

	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list, pamt_entry_size);
	if (ret)
		return ret;

	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
			sysinfo_tdmr->max_reserved_per_tdmr);
	if (ret)
		tdmrs_free_pamt_all(tdmr_list);

	/*
	 * The tdmr_info_list is read-only from here on out.
	 * Ensure that these writes are seen by other CPUs.
	 * Pairs with a smp_rmb() in is_pamt_page().
	 */
	smp_wmb();

	return ret;
}

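/*
 * Pass an array of TDMR physical addresses, plus the global KeyID, to
 * the TDX module via TDH.SYS.CONFIG.
 */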
static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
	struct tdx_module_args args = {};
	u64 *tdmr_pa_array;
	size_t array_sz;
	int i, ret;

	/*
	 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR.  The array itself also has certain
	 * alignment requirements.
	 */
	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
	array_sz = roundup_pow_of_two(array_sz);
	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
	if (!tdmr_pa_array)
		return -ENOMEM;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

	args.rcx = __pa(tdmr_pa_array);
	args.rdx = tdmr_list->nr_consumed_tdmrs;
	args.r8 = global_keyid;
	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

	/* Free the array as it is not required anymore. */
	kfree(tdmr_pa_array);

	return ret;
}

static int do_global_key_config(void *unused)
{
	struct tdx_module_args args = {};

	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * Key configuration (and thus TDMR initialization) will fail if any
 * package in the system has no online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
	cpumask_var_t packages;
	int cpu, ret = -EINVAL;

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * Hardware doesn't guarantee cache coherency across different
	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
	 * (associated with KeyID 0) before the TDX module can use the
	 * global KeyID to access the PAMT.  Given PAMTs are potentially
	 * large (~1/256th of system RAM), just use WBINVD.
	 */
	wbinvd_on_all_cpus();

	for_each_online_cpu(cpu) {
		/*
		 * The key configuration only needs to be done once per
		 * package and will return an error if configured more
		 * than once.  Avoid doing it multiple times per package.
		 */
		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
					packages))
			continue;

		/*
		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
		 * different cpus.  Do it one by one.
		 */
		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
		if (ret)
			break;
	}

	free_cpumask_var(packages);
	return ret;
}

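/* Initialize one TDMR, retrying TDH.SYS.TDMR.INIT until the whole range is done */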
static int init_tdmr(struct tdmr_info *tdmr)
{
	u64 next;

	/*
	 * Initializing a TDMR can be time consuming.  To avoid long
	 * SEAMCALLs, the TDX module may only initialize a part of the
	 * TDMR in each call.
	 */
	do {
		struct tdx_module_args args = {
			.rcx = tdmr->base,
		};
		int ret;

		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
		if (ret)
			return ret;
		/*
		 * RDX contains 'next-to-initialize' address if
		 * TDH.SYS.TDMR.INIT did not fully complete and
		 * should be retried.
		 */
		next = args.rdx;
		cond_resched();
		/* Keep making SEAMCALLs until the TDMR is done */
	} while (next < tdmr->base + tdmr->size);

	return 0;
}

static int init_tdmrs(struct tdmr_info_list *tdmr_list)
{
	int i;

	/*
	 * This operation is costly.  It can be parallelized,
	 * but keep it simple for now.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = init_tdmr(tdmr_entry(tdmr_list, i));
		if (ret)
			return ret;
	}

	return 0;
}

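/*
 * Bring the TDX module to a fully initialized state: read its global
 * metadata, build the TDX memory list, construct and configure TDMRs,
 * program the global KeyID on all packages, then initialize the TDMRs.
 */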
static int init_tdx_module(void)
{
	struct tdx_sys_info sysinfo;
	int ret;

	ret = get_tdx_sys_info(&sysinfo);
	if (ret)
		return ret;

	/* Check whether the kernel can support this module */
	ret = check_features(&sysinfo);
	if (ret)
		return ret;

	/*
	 * To keep things simple, assume that all TDX-protected memory
	 * will come from the page allocator.  Make sure all pages in the
	 * page allocator are TDX-usable memory.
	 *
	 * Build the list of "TDX-usable" memory regions which cover all
	 * pages in the page allocator to guarantee that.  Do it while
	 * holding mem_hotplug_lock read-lock as the memory hotplug code
	 * path reads the @tdx_memlist to reject any new memory.
	 */
	get_online_mems();

	ret = build_tdx_memlist(&tdx_memlist);
	if (ret)
		goto out_put_tdxmem;

	/* Allocate enough space for constructing TDMRs */
	ret = alloc_tdmr_list(&tdx_tdmr_list, &sysinfo.tdmr);
	if (ret)
		goto err_free_tdxmem;

	/* Cover all TDX-usable memory regions in TDMRs */
	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &sysinfo.tdmr);
	if (ret)
		goto err_free_tdmrs;

	/* Pass the TDMRs and the global KeyID to the TDX module */
	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
	if (ret)
		goto err_free_pamts;

	/* Config the key of global KeyID on all packages */
	ret = config_global_keyid();
	if (ret)
		goto err_reset_pamts;

	/* Initialize TDMRs to complete the TDX module initialization */
	ret = init_tdmrs(&tdx_tdmr_list);
	if (ret)
		goto err_reset_pamts;

	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));

out_put_tdxmem:
	/*
	 * @tdx_memlist is written here and read at memory hotplug time.
	 * Lock out memory hotplug code while building it.
	 */
	put_online_mems();
	return ret;

err_reset_pamts:
	/*
	 * Part of PAMTs may already have been initialized by the
	 * TDX module.  Flush cache before returning PAMTs back
	 * to the kernel.
	 */
	wbinvd_on_all_cpus();
	/*
	 * According to the TDX hardware spec, if the platform
	 * doesn't have the "partial write machine check"
	 * erratum, any kernel read/write will never cause #MC
	 * in kernel space, thus it's OK to not convert PAMTs
	 * back to normal.  But do the conversion anyway here
	 * as suggested by the TDX spec.
	 */
	tdmrs_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
err_free_tdxmem:
	free_tdx_memlist(&tdx_memlist);
	goto out_put_tdxmem;
}

static int __tdx_enable(void)
{
	int ret;

	ret = init_tdx_module();
	if (ret) {
		pr_err("module initialization failed (%d)\n", ret);
		tdx_module_status = TDX_MODULE_ERROR;
		return ret;
	}

	pr_info("module initialized\n");
	tdx_module_status = TDX_MODULE_INITIALIZED;

	return 0;
}

/**
 * tdx_enable - Enable TDX module to make it ready to run TDX guests
 *
 * This function assumes the caller has: 1) held read lock of CPU hotplug
 * lock to prevent any new cpu from becoming online; 2) done both VMXON
 * and tdx_cpu_enable() on all online cpus.
 *
 * To succeed, this function requires that there is at least one online
 * cpu in each CPU package.
 *
 * This function can be called in parallel by multiple callers.
 *
 * Return 0 if TDX is enabled successfully, otherwise error.
 */
int tdx_enable(void)
{
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_cpus_held();

	mutex_lock(&tdx_module_lock);

	switch (tdx_module_status) {
	case TDX_MODULE_UNINITIALIZED:
		ret = __tdx_enable();
		break;
	case TDX_MODULE_INITIALIZED:
		/* Already initialized, great, tell the caller. */
		ret = 0;
		break;
	default:
		/* Failed to initialize in the previous attempts */
		ret = -EINVAL;
		break;
	}

	mutex_unlock(&tdx_module_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tdx_enable);

static bool is_pamt_page(unsigned long phys)
{
	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
	int i;

	/* Ensure that all remote 'tdmr_list' writes are visible: */
	smp_rmb();

	/*
	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
	 * is initialized.  The 'tdmr_list' was initialized long ago
	 * and is now read-only.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);

		if (phys >= base && phys < (base + size))
			return true;
	}

	return false;
}

/*
 * Return whether the memory page at the given physical address is TDX
 * private memory or not.
 *
 * This can be imprecise for two known reasons:
 * 1. PAMTs are private memory and exist before the TDX module is
 *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
 *    short window that occurs once per boot.
 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
 *    page.  However, the page can still cause #MC until it has been
 *    fully converted to shared using 64-byte writes like MOVDIR64B.
 *    Buggy hosts might still leave #MC-causing memory in place which
 *    this function can not detect.
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
	struct tdx_module_args args = {
		.rcx = phys & PAGE_MASK,
	};
	u64 sret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return false;

	/* Get page type from the TDX module */
	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);

	/*
	 * The SEAMCALL will not return success unless there is a
	 * working, "ready" TDX module.  Assume an absence of TDX
	 * private pages until SEAMCALL is working.
	 */
	if (sret)
		return false;

	/*
	 * SEAMCALL was successful -- read page type (via RCX):
	 *
	 *  - PT_NDA:	Page is not used by the TDX module
	 *  - PT_RSVD:	Reserved for Non-TDX use
	 *  - Others:	Page is used by the TDX module
	 *
	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
	 * private memory.
	 */
	switch (args.rcx) {
	case PT_NDA:
		return false;
	case PT_RSVD:
		return is_pamt_page(phys);
	default:
		return true;
	}
}

/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish erratum-triggered #MCs from normal hardware ones.
 * Just print an additional message to show that such an #MC may be the
 * result of the erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
		return NULL;

	if (!paddr_is_tdx_private(m->addr))
		return NULL;

	return "TDX private memory error. Possible kernel bug.";
}

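/* Read the BIOS-configured TDX private KeyID range from the MSR */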
static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
					    u32 *nr_tdx_keyids)
{
	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
	int ret;

	/*
	 * IA32_MKTME_KEYID_PARTITIONING:
	 *   Bit [31:0]:	Number of MKTME KeyIDs.
	 *   Bit [63:32]:	Number of TDX private KeyIDs.
	 */
	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
			&_nr_tdx_keyids);
	if (ret || !_nr_tdx_keyids)
		return -EINVAL;

	/* TDX KeyIDs start after the last MKTME KeyID. */
	_tdx_keyid_start = _nr_mktme_keyids + 1;

	*tdx_keyid_start = _tdx_keyid_start;
	*nr_tdx_keyids = _nr_tdx_keyids;

	return 0;
}

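/* Check whether the pfn range is fully covered by a single TDX memory block */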
static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
	struct tdx_memblock *tmb;

	/*
	 * This check assumes that the start_pfn<->end_pfn range does not
	 * cross multiple @tdx_memlist entries.  A single memory online
	 * event across multiple memblocks (from which @tdx_memlist
	 * entries are derived at the time of module initialization) is
	 * not possible.  This is because memory offline/online is done
	 * on the granularity of 'struct memory_block', and the hotpluggable
	 * memory region (one memblock) must be a multiple of memory_block.
	 */
	list_for_each_entry(tmb, &tdx_memlist, list) {
		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
			return true;
	}
	return false;
}

static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
			       void *v)
{
	struct memory_notify *mn = v;

	if (action != MEM_GOING_ONLINE)
		return NOTIFY_OK;

	/*
	 * Empty list means TDX isn't enabled.  Allow any memory
	 * to go online.
	 */
	if (list_empty(&tdx_memlist))
		return NOTIFY_OK;

	/*
	 * The TDX memory configuration is static and can not be
	 * changed.  Reject onlining any memory which is outside of
	 * the static configuration whether it supports TDX or not.
	 */
	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
		return NOTIFY_OK;

	return NOTIFY_BAD;
}

static struct notifier_block tdx_memory_nb = {
	.notifier_call = tdx_memory_notifier,
};

static void __init check_tdx_erratum(void)
{
	/*
	 * These CPUs have an erratum.  A partial write from non-TD
	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
	 * private memory poisons that memory, and a subsequent read of
	 * that memory triggers #MC.
	 */
	switch (boot_cpu_data.x86_vfm) {
	case INTEL_SAPPHIRERAPIDS_X:
	case INTEL_EMERALDRAPIDS_X:
		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
	}
}

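/*
 * Detect BIOS-enabled TDX at boot: record the KeyID partitioning and set
 * up prerequisites so the TDX module can be initialized later on demand.
 */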
void __init tdx_init(void)
{
	u32 tdx_keyid_start, nr_tdx_keyids;
	int err;

	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
	if (err)
		return;

	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);

	/*
	 * The TDX module itself requires one 'global KeyID' to protect
	 * its metadata.  If there's only one TDX KeyID, there won't be
	 * any left for TDX guests, thus there's no point in enabling TDX
	 * at all.
	 */
	if (nr_tdx_keyids < 2) {
		pr_err("initialization failed: too few private KeyIDs available.\n");
		return;
	}

	/*
	 * At this point, hibernation_available() indicates whether or
	 * not hibernation support has been permanently disabled.
	 */
	if (hibernation_available()) {
		pr_err("initialization failed: Hibernation support is enabled\n");
		return;
	}

	err = register_memory_notifier(&tdx_memory_nb);
	if (err) {
		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
				err);
		return;
	}

#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
	acpi_suspend_lowlevel = NULL;
#endif

	/*
	 * Just use the first TDX KeyID as the 'global KeyID' and
	 * leave the rest for TDX guests.
	 */
	tdx_global_keyid = tdx_keyid_start;
	tdx_guest_keyid_start = tdx_keyid_start + 1;
	tdx_nr_guest_keyids = nr_tdx_keyids - 1;

	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);

	check_tdx_erratum();
}