Skip to content

Commit 126eb43

Browse files
Lai Jiangshan authored and bysui committed
pvm: ABI: Convert dual CR3 to single CR3
PVM is a pagetable-based virtualization system where kernel and user space separation is implemented via separate page tables. Currently, a process’s address space is defined by the CR3 register and MSR_SWITCH_CR3 (a PVM virtual MSR). Address space switching is performed by the hypercall PVM_HC_LOAD_PGTBL, which loads both CR3 and MSR_SWITCH_CR3, and by swapping CR3 and MSR_SWITCH_CR3 when switching between kernel and user modes. However, this ABI deviates from native x86 architecture and should be converted to use a single CR3, as is standard. Using a single CR3 does not eliminate separation — the hypervisor manages two underlying shadow page tables to maintain proper kernel/user isolation. This change also removes MSR_PVM_SWITCH_CR3 and the user_pgd argument from PVM_HC_LOAD_PGTBL. Benefits of the current dual-CR3 design: - The guest explicitly manages which pages belong to kernel CR3 and which to user CR3, taking responsibility for proper separation. - It allows reuse of the existing Linux kernel KPTI (Kernel Page Table Isolation) logic inside the PVM guest — the main reason why the current dual-CR3 implementation is relatively simple. Drawbacks of dual-CR3: - It deviates from the native x86 architecture, making the ABI less clear. - Future kernels may remove KPTI once CPUs affected by the Meltdown bug are obsolete (possibly in 10–20 years), making this approach unsustainable long-term. - Wastes an extra 4 KB root page table per process in the guest. After adopting a single-CR3 model: Pros: - Clear, native x86-compliant ABI. Cons: - More complex logic required in the hypervisor to carefully manage shadow page tables that distinguish between kernel and user mappings. - The new implementation must go beyond simple KPTI and fully emulate native x86 behavior. Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com> Link: #19
1 parent ee0ba4b commit 126eb43

9 files changed

Lines changed: 78 additions & 45 deletions

File tree

arch/x86/Kconfig

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,6 @@ config KVM_GUEST
854854
config PVM_GUEST
855855
bool "PVM Guest support"
856856
depends on X86_64 && KVM_GUEST && X86_PIE && !KASAN
857-
select PAGE_TABLE_ISOLATION
858857
select PARAVIRT_XXL
859858
select RANDOMIZE_MEMORY
860859
select RELOCATABLE_UNCOMPRESSED_KERNEL

arch/x86/include/uapi/asm/pvm_para.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030

3131
#define MSR_PVM_LINEAR_ADDRESS_RANGE 0x4b564df0
3232
#define MSR_PVM_VCPU_STRUCT 0x4b564df1
33-
#define MSR_PVM_SWITCH_CR3 0x4b564df2
33+
// #define MSR_PVM_SWITCH_CR3 0x4b564df2 // deprecated, FIXME: reordering when sending v2
3434
// #define MSR_PVM_SUPERVISOR_RSP 0x4b564df3 // deprecated, FIXME: reordering when sending v2
3535
#define MSR_PVM_EVENT_ENTRY 0x4b564df4
3636
#define MSR_PVM_RETU_RIP 0x4b564df5

arch/x86/kernel/pvm.c

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -183,11 +183,6 @@ static unsigned long pvm_read_cr3(void)
183183
return this_cpu_read(pvm_guest_cr3);
184184
}
185185

186-
static unsigned long pvm_user_pgd(unsigned long pgd)
187-
{
188-
return pgd | BIT(PTI_PGTABLE_SWITCH_BIT) | BIT(X86_CR3_PTI_PCID_USER_BIT);
189-
}
190-
191186
static void pvm_write_cr3(unsigned long val)
192187
{
193188
/* Convert CR3_NO_FLUSH bit to hypercall flags. */
@@ -197,7 +192,7 @@ static void pvm_write_cr3(unsigned long val)
197192
if (pgtable_l5_enabled())
198193
flags |= PVM_LOAD_PGTBL_FLAGS_LA57;
199194
this_cpu_write(pvm_guest_cr3, pgd);
200-
pvm_hypercall3(PVM_HC_LOAD_PGTBL, flags, pgd, pvm_user_pgd(pgd));
195+
pvm_hypercall2(PVM_HC_LOAD_PGTBL, flags, pgd);
201196
}
202197

203198
static void pvm_flush_tlb_user(void)

arch/x86/kvm/mmu/mmu.c

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2466,6 +2466,18 @@ static void __link_shadow_page(struct kvm *kvm,
24662466

24672467
spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
24682468

2469+
/* for PVM, if the host has NX, force guest SMEP */
2470+
if (kvm->arch.host_mmu_root_pgd && cpu_feature_enabled(X86_FEATURE_NX)) {
2471+
struct kvm_mmu_page *parent = sptep_to_sp(sptep);
2472+
2473+
/*
2474+
* validate_pvm_indirect_access() enables user sp linked beneath
2475+
* kernel sp.
2476+
*/
2477+
if (!(parent->role.access & ACC_USER_MASK) && (sp->role.access & ACC_USER_MASK))
2478+
spte |= shadow_nx_mask;
2479+
}
2480+
24692481
mmu_spte_set(sptep, spte);
24702482

24712483
mmu_page_add_parent_pte(cache, sp, sptep);
@@ -2489,6 +2501,40 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
24892501
__link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
24902502
}
24912503

2504+
static unsigned validate_pvm_indirect_access(struct kvm_vcpu *vcpu, u64 *sptep,
2505+
unsigned access, unsigned leaf_access)
2506+
{
2507+
/*
2508+
* return directly when non-pvm or it is going to create user sp/spte
2509+
* which is allowed under both kernel and user sp
2510+
*/
2511+
if (!vcpu->kvm->arch.host_mmu_root_pgd || (leaf_access & ACC_USER_MASK))
2512+
return access;
2513+
2514+
access &= ~ACC_USER_MASK;
2515+
2516+
if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2517+
struct kvm_mmu_page *child;
2518+
2519+
/*
2520+
* For the pvm indirect sp, if the previous linked child
2521+
* is for user pagetable, no kernel sp/page should be
2522+
* mapped under the child, so the child should be updated
2523+
if that is the case. This case is not possible for the
2524+
current Linux PVM guest, but this check has to be examined
2525+
* for correctness.
2526+
*/
2527+
child = spte_to_child_sp(*sptep);
2528+
if (!(child->role.access & ACC_USER_MASK))
2529+
return access;
2530+
2531+
drop_parent_pte(vcpu->kvm, child, sptep);
2532+
kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
2533+
}
2534+
2535+
return access;
2536+
}
2537+
24922538
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
24932539
unsigned direct_access)
24942540
{
@@ -5282,9 +5328,13 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
52825328
root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
52835329

52845330
/* Shadow MMU level should be the same as host for PVM */
5285-
if (vcpu->kvm->arch.host_mmu_root_pgd && root_role.level != HOST_ROOT_LEVEL) {
5286-
root_role.level = HOST_ROOT_LEVEL;
5287-
root_role.passthrough = 1;
5331+
if (vcpu->kvm->arch.host_mmu_root_pgd) {
5332+
if (root_role.level != HOST_ROOT_LEVEL) {
5333+
root_role.level = HOST_ROOT_LEVEL;
5334+
root_role.passthrough = 1;
5335+
}
5336+
if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
5337+
root_role.access &= ~ACC_USER_MASK;
52885338
}
52895339

52905340
/*

arch/x86/kvm/mmu/paging_tmpl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
674674

675675
table_gfn = gw->table_gfn[it.level - 2];
676676
access = gw->pt_access[it.level - 2];
677+
access = validate_pvm_indirect_access(vcpu, it.sptep, access, direct_access);
677678
sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
678679
false, access);
679680

arch/x86/kvm/pvm/pvm.c

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -336,8 +336,8 @@ static inline void switch_to_smod(struct kvm_vcpu *vcpu)
336336
struct vcpu_pvm *pvm = to_pvm(vcpu);
337337

338338
pvm_switch_flags_toggle_mod(pvm);
339-
kvm_mmu_new_pgd(vcpu, pvm->msr_switch_cr3);
340-
swap(pvm->msr_switch_cr3, vcpu->arch.cr3);
339+
vcpu->arch.mmu->root_role.access &= ~ACC_USER_MASK;
340+
kvm_mmu_new_pgd(vcpu, vcpu->arch.cr3);
341341

342342
pvm_write_guest_gs_base(pvm, pvm->msr_kernel_gs_base);
343343

@@ -350,8 +350,8 @@ static inline void switch_to_umod(struct kvm_vcpu *vcpu)
350350
struct vcpu_pvm *pvm = to_pvm(vcpu);
351351

352352
pvm_switch_flags_toggle_mod(pvm);
353-
kvm_mmu_new_pgd(vcpu, pvm->msr_switch_cr3);
354-
swap(pvm->msr_switch_cr3, vcpu->arch.cr3);
353+
vcpu->arch.mmu->root_role.access |= ACC_USER_MASK;
354+
kvm_mmu_new_pgd(vcpu, vcpu->arch.cr3);
355355
}
356356

357357
/*
@@ -745,11 +745,12 @@ static void pvm_flush_hwtlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
745745
static u64 get_switch_hw_cr3(struct vcpu_pvm *pvm)
746746
{
747747
struct kvm_mmu *mmu = pvm->vcpu.arch.mmu;
748-
u64 cr3 = is_smod(pvm) ? pvm->vcpu.arch.cr3 : pvm->msr_switch_cr3;
748+
union kvm_mmu_page_role switch_role = mmu->root_role;
749749
int i;
750750

751+
switch_role.access ^= ACC_USER_MASK;
751752
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
752-
if (is_root_usable(&mmu->prev_roots[i], cr3, mmu->root_role)) {
753+
if (is_root_usable(&mmu->prev_roots[i], pvm->vcpu.arch.cr3, switch_role)) {
753754
if (i != 0)
754755
swap(mmu->prev_roots[0], mmu->prev_roots[i]);
755756
return mmu->prev_roots[0].hpa;
@@ -1086,9 +1087,6 @@ static int pvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
10861087
case MSR_PVM_RETS_RIP:
10871088
msr_info->data = pvm->msr_rets_rip_plus2 - 2;
10881089
break;
1089-
case MSR_PVM_SWITCH_CR3:
1090-
msr_info->data = pvm->msr_switch_cr3;
1091-
break;
10921090
case MSR_PVM_LINEAR_ADDRESS_RANGE:
10931091
msr_info->data = pvm->msr_linear_address_range;
10941092
break;
@@ -1239,9 +1237,6 @@ static int pvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
12391237
case MSR_PVM_RETS_RIP:
12401238
pvm->msr_rets_rip_plus2 = msr_info->data + 2;
12411239
break;
1242-
case MSR_PVM_SWITCH_CR3:
1243-
pvm->msr_switch_cr3 = msr_info->data;
1244-
break;
12451240
case MSR_PVM_LINEAR_ADDRESS_RANGE:
12461241
if (!pvm_check_and_set_msr_linear_address_range(pvm, msr_info->data))
12471242
return 1;
@@ -1848,29 +1843,26 @@ static int handle_hc_irq_halt(struct kvm_vcpu *vcpu)
18481843
return kvm_emulate_halt_noskip(vcpu);
18491844
}
18501845

1851-
static void pvm_flush_tlb_guest_current_kernel_user(struct kvm_vcpu *vcpu)
1846+
static void pvm_flush_tlb_guest_current(struct kvm_vcpu *vcpu)
18521847
{
18531848
/*
1854-
* sync the current pgd and user_pgd (pvm->msr_switch_cr3)
1855-
* which is a subset work of KVM_REQ_TLB_FLUSH_GUEST.
1849+
* sync the current pgd which is a subset work of KVM_REQ_TLB_FLUSH_GUEST.
18561850
*/
18571851
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
18581852
}
18591853

18601854
/*
18611855
* Hypercall: PVM_HC_LOAD_PGTBL
1862-
* Load two PGDs into the current CR3 and MSR_PVM_SWITCH_CR3.
1856+
* Load the PGD into the current CR3.
18631857
*
18641858
* Arguments:
18651859
* flags: bit0: flush the TLBs tagged with @pgd.
18661860
* bit1: 4 (bit1=0) or 5 (bit1=1 && cpuid_has(LA57)) level paging.
18671861
* pgd: to be loaded into CR3.
1868-
* user_pgd: to be loaded into MSR_PVM_SWITCH_CR3.
18691862
*/
18701863
static int handle_hc_load_pagetables(struct kvm_vcpu *vcpu, unsigned long flags,
1871-
unsigned long pgd, unsigned long user_pgd)
1864+
unsigned long pgd)
18721865
{
1873-
struct vcpu_pvm *pvm = to_pvm(vcpu);
18741866
unsigned long cr4 = vcpu->arch.cr4;
18751867

18761868
if (!(flags & PVM_LOAD_PGTBL_FLAGS_LA57))
@@ -1885,10 +1877,9 @@ static int handle_hc_load_pagetables(struct kvm_vcpu *vcpu, unsigned long flags,
18851877

18861878
kvm_mmu_new_pgd(vcpu, pgd);
18871879
vcpu->arch.cr3 = pgd;
1888-
pvm->msr_switch_cr3 = user_pgd;
18891880

18901881
if (flags & PVM_LOAD_PGTBL_FLAGS_TLB)
1891-
pvm_flush_tlb_guest_current_kernel_user(vcpu);
1882+
pvm_flush_tlb_guest_current(vcpu);
18921883

18931884
return 1;
18941885
}
@@ -1906,11 +1897,11 @@ static int handle_hc_flush_tlb_all(struct kvm_vcpu *vcpu)
19061897

19071898
/*
19081899
* Hypercall: PVM_HC_TLB_FLUSH_CURRENT
1909-
* Flush all TLBs tagged with the current CR3 and MSR_PVM_SWITCH_CR3.
1900+
* Flush all TLBs tagged with the current CR3.
19101901
*/
1911-
static int handle_hc_flush_tlb_current_kernel_user(struct kvm_vcpu *vcpu)
1902+
static int handle_hc_flush_tlb_current(struct kvm_vcpu *vcpu)
19121903
{
1913-
pvm_flush_tlb_guest_current_kernel_user(vcpu);
1904+
pvm_flush_tlb_guest_current(vcpu);
19141905

19151906
return 1;
19161907
}
@@ -2118,11 +2109,11 @@ static int handle_exit_syscall(struct kvm_vcpu *vcpu)
21182109
case PVM_HC_IRQ_HALT:
21192110
return handle_hc_irq_halt(vcpu);
21202111
case PVM_HC_LOAD_PGTBL:
2121-
return handle_hc_load_pagetables(vcpu, a0, a1, a2);
2112+
return handle_hc_load_pagetables(vcpu, a0, a1);
21222113
case PVM_HC_TLB_FLUSH:
21232114
return handle_hc_flush_tlb_all(vcpu);
21242115
case PVM_HC_TLB_FLUSH_CURRENT:
2125-
return handle_hc_flush_tlb_current_kernel_user(vcpu);
2116+
return handle_hc_flush_tlb_current(vcpu);
21262117
case PVM_HC_TLB_INVLPG:
21272118
return handle_hc_invlpg(vcpu, a0);
21282119
case PVM_HC_LOAD_GS:
@@ -2740,8 +2731,8 @@ static fastpath_t pvm_vcpu_run(struct kvm_vcpu *vcpu)
27402731
pvm_vcpu_run_noinstr(vcpu);
27412732

27422733
if (is_smod_befor_run != is_smod(pvm)) {
2734+
vcpu->arch.mmu->root_role.access ^= ACC_USER_MASK;
27432735
swap(pvm->vcpu.arch.mmu->root, pvm->vcpu.arch.mmu->prev_roots[0]);
2744-
swap(pvm->msr_switch_cr3, pvm->vcpu.arch.cr3);
27452736
}
27462737

27472738
/* MSR_IA32_DEBUGCTLMSR is zeroed before vmenter. Restore it if needed */
@@ -2863,7 +2854,6 @@ static void pvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
28632854
pvm->msr_event_entry = 0;
28642855
pvm->msr_retu_rip_plus2 = 0;
28652856
pvm->msr_rets_rip_plus2 = 0;
2866-
pvm->msr_switch_cr3 = 0;
28672857
pvm_set_default_msr_linear_address_range(pvm);
28682858
}
28692859

@@ -2991,7 +2981,8 @@ static __init void pvm_set_cpu_caps(void)
29912981
* PVM doesn't support SMEP. When NX is supported, the guest can
29922982
* use NX on the user pagetable to emulate the same protection as SMEP.
29932983
*/
2994-
kvm_cpu_cap_clear(X86_FEATURE_SMEP);
2984+
if (boot_cpu_has(X86_FEATURE_NX))
2985+
kvm_cpu_cap_set(X86_FEATURE_SMEP);
29952986

29962987
/*
29972988
* Unlike VMX/SVM which can switches paging mode atomically, PVM

arch/x86/kvm/pvm/pvm.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,6 @@ struct vcpu_pvm {
137137
unsigned long msr_event_entry;
138138
unsigned long msr_retu_rip_plus2;
139139
unsigned long msr_rets_rip_plus2;
140-
unsigned long msr_switch_cr3;
141140
unsigned long msr_linear_address_range;
142141

143142
u64 l4_range_start;

arch/x86/kvm/x86.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1528,7 +1528,6 @@ static const u32 emulated_msrs_all[] = {
15281528

15291529
MSR_PVM_LINEAR_ADDRESS_RANGE,
15301530
MSR_PVM_VCPU_STRUCT,
1531-
MSR_PVM_SWITCH_CR3,
15321531
MSR_PVM_EVENT_ENTRY,
15331532
MSR_PVM_RETU_RIP,
15341533
MSR_PVM_RETS_RIP,

arch/x86/mm/pti.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,8 @@ void __init pti_check_boottime_disable(void)
8585
}
8686

8787
if (boot_cpu_has(X86_FEATURE_KVM_PVM_GUEST)) {
88-
pti_mode = PTI_FORCE_ON;
89-
pti_print_if_insecure("force enabled on kvm pvm guest.");
90-
setup_force_cpu_cap(X86_FEATURE_PTI);
88+
pti_mode = PTI_FORCE_OFF;
89+
pti_print_if_insecure("disabled on PVM guest.");
9190
return;
9291
}
9392

0 commit comments

Comments
 (0)