1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
4 * No bombay mix was harmed in the writing of this file.
5 *
6 * Copyright (C) 2020 Google LLC
7 * Author: Will Deacon <[email protected]>
8 */
9
10 #include <linux/bitfield.h>
11 #include <asm/kvm_pgtable.h>
12 #include <asm/stage2_pgtable.h>
13
14
15 #define KVM_PTE_TYPE BIT(1)
16 #define KVM_PTE_TYPE_BLOCK 0
17 #define KVM_PTE_TYPE_PAGE 1
18 #define KVM_PTE_TYPE_TABLE 1
19
20 struct kvm_pgtable_walk_data {
21 struct kvm_pgtable_walker *walker;
22
23 const u64 start;
24 u64 addr;
25 const u64 end;
26 };
27
28 static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
29 {
30 return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
31 }
32
33 static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
34 {
35 return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
36 }
37
38 static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
39 {
40 u64 granule = kvm_granule_size(ctx->level);
41
42 if (!kvm_level_supports_block_mapping(ctx->level))
43 return false;
44
45 if (granule > (ctx->end - ctx->addr))
46 return false;
47
48 if (!IS_ALIGNED(phys, granule))
49 return false;
50
51 return IS_ALIGNED(ctx->addr, granule);
52 }
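/*
 * For example, with a 4K granule a level-2 block covers 2MiB, so the checks
 * above only allow a block mapping when at least 2MiB of the range remains
 * and both the IA and the PA are 2MiB-aligned.
 */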
53
54 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level)
55 {
56 u64 shift = kvm_granule_shift(level);
57 u64 mask = BIT(PAGE_SHIFT - 3) - 1;
58
59 return (data->addr >> shift) & mask;
60 }
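/*
 * Each table holds BIT(PAGE_SHIFT - 3) entries; e.g. with 4K pages
 * (PAGE_SHIFT == 12) that is 512 entries, so the mask above is 0x1ff and
 * nine IA bits are consumed per level.
 */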
61
62 static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
63 {
64 u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
65 u64 mask = BIT(pgt->ia_bits) - 1;
66
67 return (addr & mask) >> shift;
68 }
69
70 static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level)
71 {
72 struct kvm_pgtable pgt = {
73 .ia_bits = ia_bits,
74 .start_level = start_level,
75 };
76
77 return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
78 }
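/*
 * The initial level of a stage-2 table may be concatenated into several
 * physically contiguous pages; kvm_pgd_page_idx() above selects which of
 * those pages covers an address, and kvm_pgd_pages() derives how many pages
 * the PGD allocation needs (1 for a non-concatenated table).
 */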
79
80 static bool kvm_pte_table(kvm_pte_t pte, s8 level)
81 {
82 if (level == KVM_PGTABLE_LAST_LEVEL)
83 return false;
84
85 if (!kvm_pte_valid(pte))
86 return false;
87
88 return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
89 }
90
91 static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
92 {
93 return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
94 }
95
96 static void kvm_clear_pte(kvm_pte_t *ptep)
97 {
98 WRITE_ONCE(*ptep, 0);
99 }
100
101 static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
102 {
103 kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
104
105 pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
106 pte |= KVM_PTE_VALID;
107 return pte;
108 }
109
110 static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level)
111 {
112 kvm_pte_t pte = kvm_phys_to_pte(pa);
113 u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE :
114 KVM_PTE_TYPE_BLOCK;
115
116 pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
117 pte |= FIELD_PREP(KVM_PTE_TYPE, type);
118 pte |= KVM_PTE_VALID;
119
120 return pte;
121 }
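/*
 * The resulting leaf descriptor has KVM_PTE_VALID set and KVM_PTE_TYPE
 * (bit 1) distinguishing page entries at the last level from block entries
 * at higher levels.
 */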
122
123 static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
124 {
125 return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
126 }
127
128 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
129 const struct kvm_pgtable_visit_ctx *ctx,
130 enum kvm_pgtable_walk_flags visit)
131 {
132 struct kvm_pgtable_walker *walker = data->walker;
133
134 /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
135 WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
136 return walker->cb(ctx, visit);
137 }
138
139 static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
140 int r)
141 {
142 /*
143 * Visitor callbacks return EAGAIN when the conditions that led to a
144 * fault are no longer reflected in the page tables due to a race to
145 * update a PTE. In the context of a fault handler this is interpreted
146 * as a signal to retry guest execution.
147 *
148 * Ignore the return code altogether for walkers outside a fault handler
149 * (e.g. write protecting a range of memory) and chug along with the
150 * page table walk.
151 */
152 if (r == -EAGAIN)
153 return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);
154
155 return !r;
156 }
157
158 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
159 struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level);
160
161 static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
162 struct kvm_pgtable_mm_ops *mm_ops,
163 kvm_pteref_t pteref, s8 level)
164 {
165 enum kvm_pgtable_walk_flags flags = data->walker->flags;
166 kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
167 struct kvm_pgtable_visit_ctx ctx = {
168 .ptep = ptep,
169 .old = READ_ONCE(*ptep),
170 .arg = data->walker->arg,
171 .mm_ops = mm_ops,
172 .start = data->start,
173 .addr = data->addr,
174 .end = data->end,
175 .level = level,
176 .flags = flags,
177 };
178 int ret = 0;
179 bool reload = false;
180 kvm_pteref_t childp;
181 bool table = kvm_pte_table(ctx.old, level);
182
183 if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
184 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
185 reload = true;
186 }
187
188 if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
189 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
190 reload = true;
191 }
192
193 /*
194 * Reload the page table after invoking the walker callback for leaf
195 * entries or after pre-order traversal, to allow the walker to descend
196 * into a newly installed or replaced table.
197 */
198 if (reload) {
199 ctx.old = READ_ONCE(*ptep);
200 table = kvm_pte_table(ctx.old, level);
201 }
202
203 if (!kvm_pgtable_walk_continue(data->walker, ret))
204 goto out;
205
206 if (!table) {
207 data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
208 data->addr += kvm_granule_size(level);
209 goto out;
210 }
211
212 childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
213 ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
214 if (!kvm_pgtable_walk_continue(data->walker, ret))
215 goto out;
216
217 if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
218 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);
219
220 out:
221 if (kvm_pgtable_walk_continue(data->walker, ret))
222 return 0;
223
224 return ret;
225 }
226
227 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
228 struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level)
229 {
230 u32 idx;
231 int ret = 0;
232
233 if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL ||
234 level > KVM_PGTABLE_LAST_LEVEL))
235 return -EINVAL;
236
237 for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
238 kvm_pteref_t pteref = &pgtable[idx];
239
240 if (data->addr >= data->end)
241 break;
242
243 ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
244 if (ret)
245 break;
246 }
247
248 return ret;
249 }
250
251 static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
252 {
253 u32 idx;
254 int ret = 0;
255 u64 limit = BIT(pgt->ia_bits);
256
257 if (data->addr > limit || data->end > limit)
258 return -ERANGE;
259
260 if (!pgt->pgd)
261 return -EINVAL;
262
263 for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
264 kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];
265
266 ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
267 if (ret)
268 break;
269 }
270
271 return ret;
272 }
273
274 int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
275 struct kvm_pgtable_walker *walker)
276 {
277 struct kvm_pgtable_walk_data walk_data = {
278 .start = ALIGN_DOWN(addr, PAGE_SIZE),
279 .addr = ALIGN_DOWN(addr, PAGE_SIZE),
280 .end = PAGE_ALIGN(walk_data.addr + size),
281 .walker = walker,
282 };
283 int r;
284
285 r = kvm_pgtable_walk_begin(walker);
286 if (r)
287 return r;
288
289 r = _kvm_pgtable_walk(pgt, &walk_data);
290 kvm_pgtable_walk_end(walker);
291
292 return r;
293 }
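/*
 * A typical caller packages its state behind a walker, e.g. (sketch,
 * mirroring leaf_walker()/kvm_pgtable_get_leaf() below; my_visitor_cb() and
 * my_data are placeholders for the caller's visitor and private state):
 *
 *	struct kvm_pgtable_walker walker = {
 *		.cb	= my_visitor_cb,
 *		.arg	= &my_data,
 *		.flags	= KVM_PGTABLE_WALK_LEAF,
 *	};
 *
 *	kvm_pgtable_walk(pgt, addr, size, &walker);
 */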
294
295 struct leaf_walk_data {
296 kvm_pte_t pte;
297 s8 level;
298 };
299
300 static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
301 enum kvm_pgtable_walk_flags visit)
302 {
303 struct leaf_walk_data *data = ctx->arg;
304
305 data->pte = ctx->old;
306 data->level = ctx->level;
307
308 return 0;
309 }
310
311 int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
312 kvm_pte_t *ptep, s8 *level)
313 {
314 struct leaf_walk_data data;
315 struct kvm_pgtable_walker walker = {
316 .cb = leaf_walker,
317 .flags = KVM_PGTABLE_WALK_LEAF,
318 .arg = &data,
319 };
320 int ret;
321
322 ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
323 PAGE_SIZE, &walker);
324 if (!ret) {
325 if (ptep)
326 *ptep = data.pte;
327 if (level)
328 *level = data.level;
329 }
330
331 return ret;
332 }
333
334 struct hyp_map_data {
335 const u64 phys;
336 kvm_pte_t attr;
337 };
338
339 static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
340 {
341 bool device = prot & KVM_PGTABLE_PROT_DEVICE;
342 u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
343 kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
344 u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
345 u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
346 KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
347
348 if (!(prot & KVM_PGTABLE_PROT_R))
349 return -EINVAL;
350
351 if (prot & KVM_PGTABLE_PROT_X) {
352 if (prot & KVM_PGTABLE_PROT_W)
353 return -EINVAL;
354
355 if (device)
356 return -EINVAL;
357
358 if (system_supports_bti_kernel())
359 attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
360 } else {
361 attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
362 }
363
364 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
365 if (!kvm_lpa2_is_enabled())
366 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
367 attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
368 attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
369 *ptep = attr;
370
371 return 0;
372 }
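/*
 * For instance, a PROT_R | PROT_W request resolves above to Normal memory
 * with AP = RW, the access flag set and XN set (not executable), plus
 * inner-shareable attributes unless LPA2 is enabled.
 */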
373
374 enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
375 {
376 enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
377 u32 ap;
378
379 if (!kvm_pte_valid(pte))
380 return prot;
381
382 if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
383 prot |= KVM_PGTABLE_PROT_X;
384
385 ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
386 if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
387 prot |= KVM_PGTABLE_PROT_R;
388 else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
389 prot |= KVM_PGTABLE_PROT_RW;
390
391 return prot;
392 }
393
394 static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
395 struct hyp_map_data *data)
396 {
397 u64 phys = data->phys + (ctx->addr - ctx->start);
398 kvm_pte_t new;
399
400 if (!kvm_block_mapping_supported(ctx, phys))
401 return false;
402
403 new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
404 if (ctx->old == new)
405 return true;
406 if (!kvm_pte_valid(ctx->old))
407 ctx->mm_ops->get_page(ctx->ptep);
408 else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
409 return false;
410
411 smp_store_release(ctx->ptep, new);
412 return true;
413 }
414
415 static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
416 enum kvm_pgtable_walk_flags visit)
417 {
418 kvm_pte_t *childp, new;
419 struct hyp_map_data *data = ctx->arg;
420 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
421
422 if (hyp_map_walker_try_leaf(ctx, data))
423 return 0;
424
425 if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
426 return -EINVAL;
427
428 childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
429 if (!childp)
430 return -ENOMEM;
431
432 new = kvm_init_table_pte(childp, mm_ops);
433 mm_ops->get_page(ctx->ptep);
434 smp_store_release(ctx->ptep, new);
435
436 return 0;
437 }
438
439 int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
440 enum kvm_pgtable_prot prot)
441 {
442 int ret;
443 struct hyp_map_data map_data = {
444 .phys = ALIGN_DOWN(phys, PAGE_SIZE),
445 };
446 struct kvm_pgtable_walker walker = {
447 .cb = hyp_map_walker,
448 .flags = KVM_PGTABLE_WALK_LEAF,
449 .arg = &map_data,
450 };
451
452 ret = hyp_set_prot_attr(prot, &map_data.attr);
453 if (ret)
454 return ret;
455
456 ret = kvm_pgtable_walk(pgt, addr, size, &walker);
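/*
 * Publish the updated entries to the hardware walker before returning;
 * note the barriers run even if the walk failed part-way.
 */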
457 dsb(ishst);
458 isb();
459 return ret;
460 }
461
462 static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
463 enum kvm_pgtable_walk_flags visit)
464 {
465 kvm_pte_t *childp = NULL;
466 u64 granule = kvm_granule_size(ctx->level);
467 u64 *unmapped = ctx->arg;
468 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
469
470 if (!kvm_pte_valid(ctx->old))
471 return -EINVAL;
472
473 if (kvm_pte_table(ctx->old, ctx->level)) {
474 childp = kvm_pte_follow(ctx->old, mm_ops);
475
476 if (mm_ops->page_count(childp) != 1)
477 return 0;
478
479 kvm_clear_pte(ctx->ptep);
480 dsb(ishst);
481 __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
482 } else {
483 if (ctx->end - ctx->addr < granule)
484 return -EINVAL;
485
486 kvm_clear_pte(ctx->ptep);
487 dsb(ishst);
488 __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
489 *unmapped += granule;
490 }
491
492 dsb(ish);
493 isb();
494 mm_ops->put_page(ctx->ptep);
495
496 if (childp)
497 mm_ops->put_page(childp);
498
499 return 0;
500 }
501
502 u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
503 {
504 u64 unmapped = 0;
505 struct kvm_pgtable_walker walker = {
506 .cb = hyp_unmap_walker,
507 .arg = &unmapped,
508 .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
509 };
510
511 if (!pgt->mm_ops->page_count)
512 return 0;
513
514 kvm_pgtable_walk(pgt, addr, size, &walker);
515 return unmapped;
516 }
517
518 int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
519 struct kvm_pgtable_mm_ops *mm_ops)
520 {
521 s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 -
522 ARM64_HW_PGTABLE_LEVELS(va_bits);
523
524 if (start_level < KVM_PGTABLE_FIRST_LEVEL ||
525 start_level > KVM_PGTABLE_LAST_LEVEL)
526 return -EINVAL;
527
528 pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
529 if (!pgt->pgd)
530 return -ENOMEM;
531
532 pgt->ia_bits = va_bits;
533 pgt->start_level = start_level;
534 pgt->mm_ops = mm_ops;
535 pgt->mmu = NULL;
536 pgt->force_pte_cb = NULL;
537
538 return 0;
539 }
540
541 static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
542 enum kvm_pgtable_walk_flags visit)
543 {
544 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
545
546 if (!kvm_pte_valid(ctx->old))
547 return 0;
548
549 mm_ops->put_page(ctx->ptep);
550
551 if (kvm_pte_table(ctx->old, ctx->level))
552 mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
553
554 return 0;
555 }
556
557 void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
558 {
559 struct kvm_pgtable_walker walker = {
560 .cb = hyp_free_walker,
561 .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
562 };
563
564 WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
565 pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
566 pgt->pgd = NULL;
567 }
568
569 struct stage2_map_data {
570 const u64 phys;
571 kvm_pte_t attr;
572 u8 owner_id;
573
574 kvm_pte_t *anchor;
575 kvm_pte_t *childp;
576
577 struct kvm_s2_mmu *mmu;
578 void *memcache;
579
580 /* Force mappings to page granularity */
581 bool force_pte;
582
583 /* Walk should update owner_id only */
584 bool annotation;
585 };
586
587 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
588 {
589 u64 vtcr = VTCR_EL2_FLAGS;
590 s8 lvls;
591
592 vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
593 vtcr |= VTCR_EL2_T0SZ(phys_shift);
594 /*
595 * Use a minimum 2 level page table to prevent splitting
596 * host PMD huge pages at stage2.
597 */
598 lvls = stage2_pgtable_levels(phys_shift);
599 if (lvls < 2)
600 lvls = 2;
601
602 /*
603 * When LPA2 is enabled, the HW supports an extra level of translation
604 * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2
605 * as an addition to SL0 to enable encoding this extra start level.
606 * However, since we always use concatenated pages for the first level
607 * lookup, we will never need this extra level and therefore do not need
608 * to touch SL2.
609 */
610 vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
611
612 #ifdef CONFIG_ARM64_HW_AFDBM
613 /*
614 * Enable the Hardware Access Flag management, unconditionally
615 * on all CPUs. In systems that have asymmetric support for the feature
616 * this allows KVM to leverage hardware support on the subset of cores
617 * that implement the feature.
618 *
619 * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
620 * hardware) on implementations that do not advertise support for the
621 * feature. As such, setting HA unconditionally is safe, unless you
622 * happen to be running on a design that has unadvertised support for
623 * HAFDBS. Here be dragons.
624 */
625 if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
626 vtcr |= VTCR_EL2_HA;
627 #endif /* CONFIG_ARM64_HW_AFDBM */
628
629 if (kvm_lpa2_is_enabled())
630 vtcr |= VTCR_EL2_DS;
631
632 /* Set the vmid bits */
633 vtcr |= (get_vmid_bits(mmfr1) == 16) ?
634 VTCR_EL2_VS_16BIT :
635 VTCR_EL2_VS_8BIT;
636
637 return vtcr;
638 }
639
640 static bool stage2_has_fwb(struct kvm_pgtable *pgt)
641 {
642 if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
643 return false;
644
645 return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
646 }
647
648 void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
649 phys_addr_t addr, size_t size)
650 {
651 unsigned long pages, inval_pages;
652
653 if (!system_supports_tlb_range()) {
654 kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
655 return;
656 }
657
658 pages = size >> PAGE_SHIFT;
659 while (pages > 0) {
660 inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
661 kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);
662
663 addr += inval_pages << PAGE_SHIFT;
664 pages -= inval_pages;
665 }
666 }
667
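/*
 * Pick the stage-2 memory-attribute encoding: the FEAT_S2FWB form when FWB
 * is in use for this page-table, the legacy MemAttr encoding otherwise.
 */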
668 #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
669
670 static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
671 kvm_pte_t *ptep)
672 {
673 kvm_pte_t attr;
674 u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
675
676 switch (prot & (KVM_PGTABLE_PROT_DEVICE |
677 KVM_PGTABLE_PROT_NORMAL_NC)) {
678 case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
679 return -EINVAL;
680 case KVM_PGTABLE_PROT_DEVICE:
681 if (prot & KVM_PGTABLE_PROT_X)
682 return -EINVAL;
683 attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
684 break;
685 case KVM_PGTABLE_PROT_NORMAL_NC:
686 if (prot & KVM_PGTABLE_PROT_X)
687 return -EINVAL;
688 attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
689 break;
690 default:
691 attr = KVM_S2_MEMATTR(pgt, NORMAL);
692 }
693
694 if (!(prot & KVM_PGTABLE_PROT_X))
695 attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
696
697 if (prot & KVM_PGTABLE_PROT_R)
698 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
699
700 if (prot & KVM_PGTABLE_PROT_W)
701 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
702
703 if (!kvm_lpa2_is_enabled())
704 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
705
706 attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
707 attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
708 *ptep = attr;
709
710 return 0;
711 }
712
713 enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
714 {
715 enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
716
717 if (!kvm_pte_valid(pte))
718 return prot;
719
720 if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
721 prot |= KVM_PGTABLE_PROT_R;
722 if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
723 prot |= KVM_PGTABLE_PROT_W;
724 if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
725 prot |= KVM_PGTABLE_PROT_X;
726
727 return prot;
728 }
729
730 static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
731 {
732 if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
733 return true;
734
735 return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
736 }
737
738 static bool stage2_pte_is_counted(kvm_pte_t pte)
739 {
740 /*
741 * The refcount tracks valid entries as well as invalid entries if they
742 * encode ownership of a page by an entity other than the page-table
743 * owner, whose id is 0.
744 */
745 return !!pte;
746 }
747
748 static bool stage2_pte_is_locked(kvm_pte_t pte)
749 {
750 return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
751 }
752
753 static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
754 {
755 if (!kvm_pgtable_walk_shared(ctx)) {
756 WRITE_ONCE(*ctx->ptep, new);
757 return true;
758 }
759
760 return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
761 }
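/*
 * For shared walks the cmpxchg() above fails if another walker updated the
 * PTE concurrently; callers typically surface that as -EAGAIN so the fault
 * is retried.
 */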
762
763 /**
764 * stage2_try_break_pte() - Invalidates a pte according to the
765 * 'break-before-make' requirements of the
766 * architecture.
767 *
768 * @ctx: context of the visited pte.
769 * @mmu: stage-2 mmu
770 *
771 * Returns: true if the pte was successfully broken.
772 *
773 * If the removed pte was valid, performs the necessary serialization and TLB
774 * invalidation for the old value. For counted ptes, drops the reference count
775 * on the containing table page.
776 */
777 static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
778 struct kvm_s2_mmu *mmu)
779 {
780 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
781
782 if (stage2_pte_is_locked(ctx->old)) {
783 /*
784 * Should never occur if this walker has exclusive access to the
785 * page tables.
786 */
787 WARN_ON(!kvm_pgtable_walk_shared(ctx));
788 return false;
789 }
790
791 if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
792 return false;
793
794 if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
795 /*
796 * Perform the appropriate TLB invalidation based on the
797 * evicted pte value (if any).
798 */
799 if (kvm_pte_table(ctx->old, ctx->level)) {
800 u64 size = kvm_granule_size(ctx->level);
801 u64 addr = ALIGN_DOWN(ctx->addr, size);
802
803 kvm_tlb_flush_vmid_range(mmu, addr, size);
804 } else if (kvm_pte_valid(ctx->old)) {
805 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
806 ctx->addr, ctx->level);
807 }
808 }
809
810 if (stage2_pte_is_counted(ctx->old))
811 mm_ops->put_page(ctx->ptep);
812
813 return true;
814 }
815
816 static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
817 {
818 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
819
820 WARN_ON(!stage2_pte_is_locked(*ctx->ptep));
821
822 if (stage2_pte_is_counted(new))
823 mm_ops->get_page(ctx->ptep);
824
825 smp_store_release(ctx->ptep, new);
826 }
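/*
 * The release above orders prior table initialisation (and CMOs) before the
 * PTE becomes visible, so a concurrent walker never sees a valid entry
 * pointing at uninitialised state.
 */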
827
828 static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
829 {
830 /*
831 * If FEAT_TLBIRANGE is implemented, defer the individual
832 * TLB invalidations until the entire walk is finished, and
833 * then use the range-based TLBI instructions to do the
834 * invalidations. Condition deferred TLB invalidation on the
835 * system supporting FWB as the optimization is entirely
836 * pointless when the unmap walker needs to perform CMOs.
837 */
838 return system_supports_tlb_range() && stage2_has_fwb(pgt);
839 }
840
841 static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
842 struct kvm_s2_mmu *mmu,
843 struct kvm_pgtable_mm_ops *mm_ops)
844 {
845 struct kvm_pgtable *pgt = ctx->arg;
846
847 /*
848 * Clear the existing PTE, and perform break-before-make if it was
849 * valid. Depending on the system support, defer the TLB maintenance
850 * for the same until the entire unmap walk is completed.
851 */
852 if (kvm_pte_valid(ctx->old)) {
853 kvm_clear_pte(ctx->ptep);
854
855 if (kvm_pte_table(ctx->old, ctx->level)) {
856 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
857 TLBI_TTL_UNKNOWN);
858 } else if (!stage2_unmap_defer_tlb_flush(pgt)) {
859 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
860 ctx->level);
861 }
862 }
863
864 mm_ops->put_page(ctx->ptep);
865 }
866
867 static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
868 {
869 u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
870 return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL);
871 }
872
873 static bool stage2_pte_executable(kvm_pte_t pte)
874 {
875 return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
876 }
877
878 static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
879 const struct stage2_map_data *data)
880 {
881 u64 phys = data->phys;
882
883 /* Work out the correct PA based on how far the walk has gotten */
884 return phys + (ctx->addr - ctx->start);
885 }
886
887 static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
888 struct stage2_map_data *data)
889 {
890 u64 phys = stage2_map_walker_phys_addr(ctx, data);
891
892 if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL)
893 return false;
894
895 if (data->annotation)
896 return true;
897
898 return kvm_block_mapping_supported(ctx, phys);
899 }
900
901 static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
902 struct stage2_map_data *data)
903 {
904 kvm_pte_t new;
905 u64 phys = stage2_map_walker_phys_addr(ctx, data);
906 u64 granule = kvm_granule_size(ctx->level);
907 struct kvm_pgtable *pgt = data->mmu->pgt;
908 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
909
910 if (!stage2_leaf_mapping_allowed(ctx, data))
911 return -E2BIG;
912
913 if (!data->annotation)
914 new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
915 else
916 new = kvm_init_invalid_leaf_owner(data->owner_id);
917
918 /*
919 * Skip updating the PTE if we are trying to recreate the exact
920 * same mapping or only change the access permissions. Instead,
921 * the vCPU will take one more exit from the guest if still needed
922 * and then go through the path of relaxing permissions.
923 */
924 if (!stage2_pte_needs_update(ctx->old, new))
925 return -EAGAIN;
926
927 /* If we're only changing software bits, then store them and go! */
928 if (!kvm_pgtable_walk_shared(ctx) &&
929 !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) {
930 bool old_is_counted = stage2_pte_is_counted(ctx->old);
931
932 if (old_is_counted != stage2_pte_is_counted(new)) {
933 if (old_is_counted)
934 mm_ops->put_page(ctx->ptep);
935 else
936 mm_ops->get_page(ctx->ptep);
937 }
938 WARN_ON_ONCE(!stage2_try_set_pte(ctx, new));
939 return 0;
940 }
941
942 if (!stage2_try_break_pte(ctx, data->mmu))
943 return -EAGAIN;
944
945 /* Perform CMOs before installation of the guest stage-2 PTE */
946 if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
947 stage2_pte_cacheable(pgt, new))
948 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
949 granule);
950
951 if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
952 stage2_pte_executable(new))
953 mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
954
955 stage2_make_pte(ctx, new);
956
957 return 0;
958 }
959
960 static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
961 struct stage2_map_data *data)
962 {
963 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
964 kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
965 int ret;
966
967 if (!stage2_leaf_mapping_allowed(ctx, data))
968 return 0;
969
970 ret = stage2_map_walker_try_leaf(ctx, data);
971 if (ret)
972 return ret;
973
974 mm_ops->free_unlinked_table(childp, ctx->level);
975 return 0;
976 }
977
978 static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
979 struct stage2_map_data *data)
980 {
981 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
982 kvm_pte_t *childp, new;
983 int ret;
984
985 ret = stage2_map_walker_try_leaf(ctx, data);
986 if (ret != -E2BIG)
987 return ret;
988
989 if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
990 return -EINVAL;
991
992 if (!data->memcache)
993 return -ENOMEM;
994
995 childp = mm_ops->zalloc_page(data->memcache);
996 if (!childp)
997 return -ENOMEM;
998
999 if (!stage2_try_break_pte(ctx, data->mmu)) {
1000 mm_ops->put_page(childp);
1001 return -EAGAIN;
1002 }
1003
1004 /*
1005 * If we've run into an existing block mapping then replace it with
1006 * a table. Accesses beyond 'end' that fall within the new table
1007 * will be mapped lazily.
1008 */
1009 new = kvm_init_table_pte(childp, mm_ops);
1010 stage2_make_pte(ctx, new);
1011
1012 return 0;
1013 }
1014
1015 /*
1016 * The TABLE_PRE callback runs for table entries on the way down, looking
1017 * for table entries which we could conceivably replace with a block entry
1018 * for this mapping. If it finds one it replaces the entry and calls
1019 * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
1020 *
1021 * Otherwise, the LEAF callback performs the mapping at the existing leaves
1022 * instead.
1023 */
1024 static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
1025 enum kvm_pgtable_walk_flags visit)
1026 {
1027 struct stage2_map_data *data = ctx->arg;
1028
1029 switch (visit) {
1030 case KVM_PGTABLE_WALK_TABLE_PRE:
1031 return stage2_map_walk_table_pre(ctx, data);
1032 case KVM_PGTABLE_WALK_LEAF:
1033 return stage2_map_walk_leaf(ctx, data);
1034 default:
1035 return -EINVAL;
1036 }
1037 }
1038
1039 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
1040 u64 phys, enum kvm_pgtable_prot prot,
1041 void *mc, enum kvm_pgtable_walk_flags flags)
1042 {
1043 int ret;
1044 struct stage2_map_data map_data = {
1045 .phys = ALIGN_DOWN(phys, PAGE_SIZE),
1046 .mmu = pgt->mmu,
1047 .memcache = mc,
1048 .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
1049 };
1050 struct kvm_pgtable_walker walker = {
1051 .cb = stage2_map_walker,
1052 .flags = flags |
1053 KVM_PGTABLE_WALK_TABLE_PRE |
1054 KVM_PGTABLE_WALK_LEAF,
1055 .arg = &map_data,
1056 };
1057
1058 if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
1059 return -EINVAL;
1060
1061 ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
1062 if (ret)
1063 return ret;
1064
1065 ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1066 dsb(ishst);
1067 return ret;
1068 }
1069
1070 int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
1071 void *mc, u8 owner_id)
1072 {
1073 int ret;
1074 struct stage2_map_data map_data = {
1075 .mmu = pgt->mmu,
1076 .memcache = mc,
1077 .owner_id = owner_id,
1078 .force_pte = true,
1079 .annotation = true,
1080 };
1081 struct kvm_pgtable_walker walker = {
1082 .cb = stage2_map_walker,
1083 .flags = KVM_PGTABLE_WALK_TABLE_PRE |
1084 KVM_PGTABLE_WALK_LEAF,
1085 .arg = &map_data,
1086 };
1087
1088 if (owner_id > KVM_MAX_OWNER_ID)
1089 return -EINVAL;
1090
1091 ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1092 return ret;
1093 }
1094
1095 static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
1096 enum kvm_pgtable_walk_flags visit)
1097 {
1098 struct kvm_pgtable *pgt = ctx->arg;
1099 struct kvm_s2_mmu *mmu = pgt->mmu;
1100 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1101 kvm_pte_t *childp = NULL;
1102 bool need_flush = false;
1103
1104 if (!kvm_pte_valid(ctx->old)) {
1105 if (stage2_pte_is_counted(ctx->old)) {
1106 kvm_clear_pte(ctx->ptep);
1107 mm_ops->put_page(ctx->ptep);
1108 }
1109 return 0;
1110 }
1111
1112 if (kvm_pte_table(ctx->old, ctx->level)) {
1113 childp = kvm_pte_follow(ctx->old, mm_ops);
1114
1115 if (mm_ops->page_count(childp) != 1)
1116 return 0;
1117 } else if (stage2_pte_cacheable(pgt, ctx->old)) {
1118 need_flush = !stage2_has_fwb(pgt);
1119 }
1120
1121 /*
1122 * This is similar to the map() path in that we unmap the entire
1123 * block entry and rely on the remaining portions being faulted
1124 * back lazily.
1125 */
1126 stage2_unmap_put_pte(ctx, mmu, mm_ops);
1127
1128 if (need_flush && mm_ops->dcache_clean_inval_poc)
1129 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
1130 kvm_granule_size(ctx->level));
1131
1132 if (childp)
1133 mm_ops->put_page(childp);
1134
1135 return 0;
1136 }
1137
1138 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
1139 {
1140 int ret;
1141 struct kvm_pgtable_walker walker = {
1142 .cb = stage2_unmap_walker,
1143 .arg = pgt,
1144 .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
1145 };
1146
1147 ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1148 if (stage2_unmap_defer_tlb_flush(pgt))
1149 /* Perform the deferred TLB invalidations */
1150 kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
1151
1152 return ret;
1153 }
1154
1155 struct stage2_attr_data {
1156 kvm_pte_t attr_set;
1157 kvm_pte_t attr_clr;
1158 kvm_pte_t pte;
1159 s8 level;
1160 };
1161
1162 static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
1163 enum kvm_pgtable_walk_flags visit)
1164 {
1165 kvm_pte_t pte = ctx->old;
1166 struct stage2_attr_data *data = ctx->arg;
1167 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1168
1169 if (!kvm_pte_valid(ctx->old))
1170 return -EAGAIN;
1171
1172 data->level = ctx->level;
1173 data->pte = pte;
1174 pte &= ~data->attr_clr;
1175 pte |= data->attr_set;
1176
1177 /*
1178 * We may race with the CPU trying to set the access flag here,
1179 * but worst-case the access flag update gets lost and will be
1180 * set on the next access instead.
1181 */
1182 if (data->pte != pte) {
1183 /*
1184 * Invalidate instruction cache before updating the guest
1185 * stage-2 PTE if we are going to add executable permission.
1186 */
1187 if (mm_ops->icache_inval_pou &&
1188 stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
1189 mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
1190 kvm_granule_size(ctx->level));
1191
1192 if (!stage2_try_set_pte(ctx, pte))
1193 return -EAGAIN;
1194 }
1195
1196 return 0;
1197 }
1198
1199 static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
1200 u64 size, kvm_pte_t attr_set,
1201 kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
1202 s8 *level, enum kvm_pgtable_walk_flags flags)
1203 {
1204 int ret;
1205 kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
1206 struct stage2_attr_data data = {
1207 .attr_set = attr_set & attr_mask,
1208 .attr_clr = attr_clr & attr_mask,
1209 };
1210 struct kvm_pgtable_walker walker = {
1211 .cb = stage2_attr_walker,
1212 .arg = &data,
1213 .flags = flags | KVM_PGTABLE_WALK_LEAF,
1214 };
1215
1216 ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1217 if (ret)
1218 return ret;
1219
1220 if (orig_pte)
1221 *orig_pte = data.pte;
1222
1223 if (level)
1224 *level = data.level;
1225 return 0;
1226 }
1227
1228 int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
1229 {
1230 return stage2_update_leaf_attrs(pgt, addr, size, 0,
1231 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
1232 NULL, NULL, 0);
1233 }
1234
1235 void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
1236 enum kvm_pgtable_walk_flags flags)
1237 {
1238 int ret;
1239
1240 ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
1241 NULL, NULL, flags);
1242 if (!ret)
1243 dsb(ishst);
1244 }
1245
1246 struct stage2_age_data {
1247 bool mkold;
1248 bool young;
1249 };
1250
1251 static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
1252 enum kvm_pgtable_walk_flags visit)
1253 {
1254 kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
1255 struct stage2_age_data *data = ctx->arg;
1256
1257 if (!kvm_pte_valid(ctx->old) || new == ctx->old)
1258 return 0;
1259
1260 data->young = true;
1261
1262 /*
1263 * stage2_age_walker() is always called while holding the MMU lock for
1264 * write, so this will always succeed. Nonetheless, this deliberately
1265 * follows the race detection pattern of the other stage-2 walkers in
1266 * case the locking mechanics of the MMU notifiers is ever changed.
1267 */
1268 if (data->mkold && !stage2_try_set_pte(ctx, new))
1269 return -EAGAIN;
1270
1271 /*
1272 * "But where's the TLBI?!", you scream.
1273 * "Over in the core code", I sigh.
1274 *
1275 * See the '->clear_flush_young()' callback on the KVM mmu notifier.
1276 */
1277 return 0;
1278 }
1279
1280 bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
1281 u64 size, bool mkold)
1282 {
1283 struct stage2_age_data data = {
1284 .mkold = mkold,
1285 };
1286 struct kvm_pgtable_walker walker = {
1287 .cb = stage2_age_walker,
1288 .arg = &data,
1289 .flags = KVM_PGTABLE_WALK_LEAF,
1290 };
1291
1292 WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
1293 return data.young;
1294 }
1295
1296 int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
1297 enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
1298 {
1299 int ret;
1300 s8 level;
1301 kvm_pte_t set = 0, clr = 0;
1302
1303 if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
1304 return -EINVAL;
1305
1306 if (prot & KVM_PGTABLE_PROT_R)
1307 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
1308
1309 if (prot & KVM_PGTABLE_PROT_W)
1310 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
1311
1312 if (prot & KVM_PGTABLE_PROT_X)
1313 clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
1314
1315 ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
1316 if (!ret || ret == -EAGAIN)
1317 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
1318 return ret;
1319 }
1320
1321 static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
1322 enum kvm_pgtable_walk_flags visit)
1323 {
1324 struct kvm_pgtable *pgt = ctx->arg;
1325 struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
1326
1327 if (!stage2_pte_cacheable(pgt, ctx->old))
1328 return 0;
1329
1330 if (mm_ops->dcache_clean_inval_poc)
1331 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
1332 kvm_granule_size(ctx->level));
1333 return 0;
1334 }
1335
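/*
 * With FEAT_S2FWB the combined memory attributes are forced to write-back
 * cacheable, so no data-cache CMOs are required and the flush below returns
 * early.
 */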
1336 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
1337 {
1338 struct kvm_pgtable_walker walker = {
1339 .cb = stage2_flush_walker,
1340 .flags = KVM_PGTABLE_WALK_LEAF,
1341 .arg = pgt,
1342 };
1343
1344 if (stage2_has_fwb(pgt))
1345 return 0;
1346
1347 return kvm_pgtable_walk(pgt, addr, size, &walker);
1348 }
1349
1350 kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
1351 u64 phys, s8 level,
1352 enum kvm_pgtable_prot prot,
1353 void *mc, bool force_pte)
1354 {
1355 struct stage2_map_data map_data = {
1356 .phys = phys,
1357 .mmu = pgt->mmu,
1358 .memcache = mc,
1359 .force_pte = force_pte,
1360 };
1361 struct kvm_pgtable_walker walker = {
1362 .cb = stage2_map_walker,
1363 .flags = KVM_PGTABLE_WALK_LEAF |
1364 KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
1365 KVM_PGTABLE_WALK_SKIP_CMO,
1366 .arg = &map_data,
1367 };
1368 /*
1369 * The input address (.addr) is irrelevant for walking an
1370 * unlinked table. Construct an arbitrary IA range to map
1371 * kvm_granule_size(level) worth of memory.
1372 */
1373 struct kvm_pgtable_walk_data data = {
1374 .walker = &walker,
1375 .addr = 0,
1376 .end = kvm_granule_size(level),
1377 };
1378 struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
1379 kvm_pte_t *pgtable;
1380 int ret;
1381
1382 if (!IS_ALIGNED(phys, kvm_granule_size(level)))
1383 return ERR_PTR(-EINVAL);
1384
1385 ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
1386 if (ret)
1387 return ERR_PTR(ret);
1388
1389 pgtable = mm_ops->zalloc_page(mc);
1390 if (!pgtable)
1391 return ERR_PTR(-ENOMEM);
1392
1393 ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
1394 level + 1);
1395 if (ret) {
1396 kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
1397 return ERR_PTR(ret);
1398 }
1399
1400 return pgtable;
1401 }
1402
1403 /*
1404 * Get the number of page-tables needed to replace a block with a
1405 * fully populated tree up to the PTE entries. Note that @level is
1406 * interpreted as in "level @level entry".
1407 */
1408 static int stage2_block_get_nr_page_tables(s8 level)
1409 {
1410 switch (level) {
1411 case 1:
1412 return PTRS_PER_PTE + 1;
1413 case 2:
1414 return 1;
1415 case 3:
1416 return 0;
1417 default:
1418 WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
1419 level > KVM_PGTABLE_LAST_LEVEL);
1420 return -EINVAL;
1421 };
1422 }
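/*
 * With a 4K granule, replacing a level-1 block (1GiB) needs one level-2
 * table plus PTRS_PER_PTE (512) level-3 tables, a level-2 block (2MiB)
 * needs a single level-3 table, and a level-3 entry is already a page.
 */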
1423
1424 static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
1425 enum kvm_pgtable_walk_flags visit)
1426 {
1427 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1428 struct kvm_mmu_memory_cache *mc = ctx->arg;
1429 struct kvm_s2_mmu *mmu;
1430 kvm_pte_t pte = ctx->old, new, *childp;
1431 enum kvm_pgtable_prot prot;
1432 s8 level = ctx->level;
1433 bool force_pte;
1434 int nr_pages;
1435 u64 phys;
1436
1437 /* No huge-pages exist at the last level */
1438 if (level == KVM_PGTABLE_LAST_LEVEL)
1439 return 0;
1440
1441 /* We only split valid block mappings */
1442 if (!kvm_pte_valid(pte))
1443 return 0;
1444
1445 nr_pages = stage2_block_get_nr_page_tables(level);
1446 if (nr_pages < 0)
1447 return nr_pages;
1448
1449 if (mc->nobjs >= nr_pages) {
1450 /* Build a tree mapped down to the PTE granularity. */
1451 force_pte = true;
1452 } else {
1453 /*
1454 * Don't force PTEs, so create_unlinked() below does
1455 * not populate the tree up to the PTE level. The
1456 * consequence is that the call will require a single
1457 * page of level 2 entries at level 1, or a single
1458 * page of PTEs at level 2. If we are at level 1, the
1459 * PTEs will be created recursively.
1460 */
1461 force_pte = false;
1462 nr_pages = 1;
1463 }
1464
1465 if (mc->nobjs < nr_pages)
1466 return -ENOMEM;
1467
1468 mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
1469 phys = kvm_pte_to_phys(pte);
1470 prot = kvm_pgtable_stage2_pte_prot(pte);
1471
1472 childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
1473 level, prot, mc, force_pte);
1474 if (IS_ERR(childp))
1475 return PTR_ERR(childp);
1476
1477 if (!stage2_try_break_pte(ctx, mmu)) {
1478 kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
1479 return -EAGAIN;
1480 }
1481
1482 /*
1483 * Note, the contents of the page table are guaranteed to be made
1484 * visible before the new PTE is assigned because stage2_make_pte()
1485 * writes the PTE using smp_store_release().
1486 */
1487 new = kvm_init_table_pte(childp, mm_ops);
1488 stage2_make_pte(ctx, new);
1489 return 0;
1490 }
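/*
 * Note: the memcache passed in ctx->arg must be the split_page_cache
 * embedded in a struct kvm_s2_mmu, as the container_of() above relies on
 * that layout to recover the MMU.
 */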
1491
1492 int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
1493 struct kvm_mmu_memory_cache *mc)
1494 {
1495 struct kvm_pgtable_walker walker = {
1496 .cb = stage2_split_walker,
1497 .flags = KVM_PGTABLE_WALK_LEAF,
1498 .arg = mc,
1499 };
1500 int ret;
1501
1502 ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1503 dsb(ishst);
1504 return ret;
1505 }
1506
1507 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
1508 struct kvm_pgtable_mm_ops *mm_ops,
1509 enum kvm_pgtable_stage2_flags flags,
1510 kvm_pgtable_force_pte_cb_t force_pte_cb)
1511 {
1512 size_t pgd_sz;
1513 u64 vtcr = mmu->vtcr;
1514 u32 ia_bits = VTCR_EL2_IPA(vtcr);
1515 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1516 s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1517
1518 pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1519 pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
1520 if (!pgt->pgd)
1521 return -ENOMEM;
1522
1523 pgt->ia_bits = ia_bits;
1524 pgt->start_level = start_level;
1525 pgt->mm_ops = mm_ops;
1526 pgt->mmu = mmu;
1527 pgt->flags = flags;
1528 pgt->force_pte_cb = force_pte_cb;
1529
1530 /* Ensure zeroed PGD pages are visible to the hardware walker */
1531 dsb(ishst);
1532 return 0;
1533 }
1534
1535 size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
1536 {
1537 u32 ia_bits = VTCR_EL2_IPA(vtcr);
1538 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1539 s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1540
1541 return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1542 }
1543
1544 static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
1545 enum kvm_pgtable_walk_flags visit)
1546 {
1547 struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1548
1549 if (!stage2_pte_is_counted(ctx->old))
1550 return 0;
1551
1552 mm_ops->put_page(ctx->ptep);
1553
1554 if (kvm_pte_table(ctx->old, ctx->level))
1555 mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
1556
1557 return 0;
1558 }
1559
1560 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
1561 {
1562 size_t pgd_sz;
1563 struct kvm_pgtable_walker walker = {
1564 .cb = stage2_free_walker,
1565 .flags = KVM_PGTABLE_WALK_LEAF |
1566 KVM_PGTABLE_WALK_TABLE_POST,
1567 };
1568
1569 WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
1570 pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
1571 pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
1572 pgt->pgd = NULL;
1573 }
1574
1575 void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
1576 {
1577 kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
1578 struct kvm_pgtable_walker walker = {
1579 .cb = stage2_free_walker,
1580 .flags = KVM_PGTABLE_WALK_LEAF |
1581 KVM_PGTABLE_WALK_TABLE_POST,
1582 };
1583 struct kvm_pgtable_walk_data data = {
1584 .walker = &walker,
1585
1586 /*
1587 * At this point the IPA really doesn't matter, as the page
1588 * table being traversed has already been removed from the stage
1589 * 2. Set an appropriate range to cover the entire page table.
1590 */
1591 .addr = 0,
1592 .end = kvm_granule_size(level),
1593 };
1594
1595 WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));
1596
1597 WARN_ON(mm_ops->page_count(pgtable) != 1);
1598 mm_ops->put_page(pgtable);
1599 }
1600