@@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
102102 * @addr: IPA
103103 * @pmd: pmd pointer for IPA
104104 *
105- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
106- * pages in the range dirty.
105+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
107106 */
108107static void stage2_dissolve_pmd (struct kvm * kvm , phys_addr_t addr , pmd_t * pmd )
109108{
@@ -121,8 +120,7 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
121120 * @addr: IPA
122121 * @pud: pud pointer for IPA
123122 *
124- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
125- * pages in the range dirty.
123+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
126124 */
127125static void stage2_dissolve_pud (struct kvm * kvm , phys_addr_t addr , pud_t * pudp )
128126{
@@ -899,9 +897,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
899897 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
900898 * @kvm: The KVM struct pointer for the VM.
901899 *
902- * Allocates only the stage-2 HW PGD level table(s) (can support either full
903- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
904- * allocated pages.
900+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
901+ * stage2_pgd_size(kvm).
905902 *
906903 * Note we don't need locking here as this is only called when the VM is
907904 * created, which can only be done once.
@@ -1067,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
10671064{
10681065 pmd_t * pmd , old_pmd ;
10691066
1067+ retry :
10701068 pmd = stage2_get_pmd (kvm , cache , addr );
10711069 VM_BUG_ON (!pmd );
10721070
10731071 old_pmd = * pmd ;
1072+ /*
1073+ * Multiple vcpus faulting on the same PMD entry, can
1074+ * lead to them sequentially updating the PMD with the
1075+ * same value. Following the break-before-make
1076+ * (pmd_clear() followed by tlb_flush()) process can
1077+ * hinder forward progress due to refaults generated
1078+ * on missing translations.
1079+ *
1080+ * Skip updating the page table if the entry is
1081+ * unchanged.
1082+ */
1083+ if (pmd_val (old_pmd ) == pmd_val (* new_pmd ))
1084+ return 0 ;
1085+
10741086 if (pmd_present (old_pmd )) {
10751087 /*
1076- * Multiple vcpus faulting on the same PMD entry, can
1077- * lead to them sequentially updating the PMD with the
1078- * same value. Following the break-before-make
1079- * (pmd_clear() followed by tlb_flush()) process can
1080- * hinder forward progress due to refaults generated
1081- * on missing translations.
1088+ * If we already have PTE level mapping for this block,
1089+ * we must unmap it to avoid inconsistent TLB state and
1090+ * leaking the table page. We could end up in this situation
1091+ * if the memory slot was marked for dirty logging and was
1092+ * reverted, leaving PTE level mappings for the pages accessed
1093+ * during the period. So, unmap the PTE level mapping for this
1094+ * block and retry, as we could have released the upper level
1095+ * table in the process.
10821096 *
1083- * Skip updating the page table if the entry is
1084- * unchanged .
1097+ * Normal THP split/merge follows the mmu_notifier callbacks and
1098+ * is handled accordingly.
10851099 */
1086- if (pmd_val (old_pmd ) == pmd_val (* new_pmd ))
1087- return 0 ;
1088-
1100+ if (!pmd_thp_or_huge (old_pmd )) {
1101+ unmap_stage2_range (kvm , addr & S2_PMD_MASK , S2_PMD_SIZE );
1102+ goto retry ;
1103+ }
10891104 /*
10901105 * Mapping in huge pages should only happen through a
10911106 * fault. If a page is merged into a transparent huge
@@ -1097,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
10971112 * should become splitting first, unmapped, merged,
10981113 * and mapped back in on-demand.
10991114 */
1100- VM_BUG_ON (pmd_pfn (old_pmd ) != pmd_pfn (* new_pmd ));
1101-
1115+ WARN_ON_ONCE (pmd_pfn (old_pmd ) != pmd_pfn (* new_pmd ));
11021116 pmd_clear (pmd );
11031117 kvm_tlb_flush_vmid_ipa (kvm , addr );
11041118 } else {
@@ -1114,21 +1128,31 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac
11141128{
11151129 pud_t * pudp , old_pud ;
11161130
1131+ retry :
11171132 pudp = stage2_get_pud (kvm , cache , addr );
11181133 VM_BUG_ON (!pudp );
11191134
11201135 old_pud = * pudp ;
11211136
11221137 /*
11231138 * A large number of vcpus faulting on the same stage 2 entry,
1124- * can lead to a refault due to the
1125- * stage2_pud_clear()/tlb_flush(). Skip updating the page
1126- * tables if there is no change.
1139+ * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
1140+ * Skip updating the page tables if there is no change.
11271141 */
11281142 if (pud_val (old_pud ) == pud_val (* new_pudp ))
11291143 return 0 ;
11301144
11311145 if (stage2_pud_present (kvm , old_pud )) {
1146+ /*
1147+ * If we already have table level mapping for this block, unmap
1148+ * the range for this block and retry.
1149+ */
1150+ if (!stage2_pud_huge (kvm , old_pud )) {
1151+ unmap_stage2_range (kvm , addr & S2_PUD_MASK , S2_PUD_SIZE );
1152+ goto retry ;
1153+ }
1154+
1155+ WARN_ON_ONCE (kvm_pud_pfn (old_pud ) != kvm_pud_pfn (* new_pudp ));
11321156 stage2_pud_clear (kvm , pudp );
11331157 kvm_tlb_flush_vmid_ipa (kvm , addr );
11341158 } else {
@@ -1451,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
14511475}
14521476
14531477/**
1454- * stage2_wp_puds - write protect PGD range
1455- * @pgd: pointer to pgd entry
1456- * @addr: range start address
1457- * @end: range end address
1458- *
1459- * Process PUD entries, for a huge PUD we cause a panic.
1460- */
1478+ * stage2_wp_puds - write protect PGD range
1479+ * @pgd: pointer to pgd entry
1480+ * @addr: range start address
1481+ * @end: range end address
1482+ */
14611483static void stage2_wp_puds (struct kvm * kvm , pgd_t * pgd ,
14621484 phys_addr_t addr , phys_addr_t end )
14631485{
@@ -1594,8 +1616,9 @@ static void kvm_send_hwpoison_signal(unsigned long address,
15941616 send_sig_mceerr (BUS_MCEERR_AR , (void __user * )address , lsb , current );
15951617}
15961618
1597- static bool fault_supports_stage2_pmd_mappings (struct kvm_memory_slot * memslot ,
1598- unsigned long hva )
1619+ static bool fault_supports_stage2_huge_mapping (struct kvm_memory_slot * memslot ,
1620+ unsigned long hva ,
1621+ unsigned long map_size )
15991622{
16001623 gpa_t gpa_start ;
16011624 hva_t uaddr_start , uaddr_end ;
@@ -1610,34 +1633,34 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
16101633
16111634 /*
16121635 * Pages belonging to memslots that don't have the same alignment
1613- * within a PMD for userspace and IPA cannot be mapped with stage-2
1614- * PMD entries, because we'll end up mapping the wrong pages.
1636+ * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
1637+ * PMD/PUD entries, because we'll end up mapping the wrong pages.
16151638 *
16161639 * Consider a layout like the following:
16171640 *
16181641 * memslot->userspace_addr:
16191642 * +-----+--------------------+--------------------+---+
1620- * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz|
1643+ * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
16211644 * +-----+--------------------+--------------------+---+
16221645 *
16231646 * memslot->base_gfn << PAGE_SHIFT:
16241647 * +---+--------------------+--------------------+-----+
1625- * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz|
1648+ * |abc|def Stage-2 block | Stage-2 block |tvxyz|
16261649 * +---+--------------------+--------------------+-----+
16271650 *
1628- * If we create those stage-2 PMDs , we'll end up with this incorrect
1651+ * If we create those stage-2 blocks , we'll end up with this incorrect
16291652 * mapping:
16301653 * d -> f
16311654 * e -> g
16321655 * f -> h
16331656 */
1634- if ((gpa_start & ~ S2_PMD_MASK ) != (uaddr_start & ~ S2_PMD_MASK ))
1657+ if ((gpa_start & ( map_size - 1 )) != (uaddr_start & ( map_size - 1 ) ))
16351658 return false;
16361659
16371660 /*
16381661 * Next, let's make sure we're not trying to map anything not covered
1639- * by the memslot. This means we have to prohibit PMD size mappings
1640- * for the beginning and end of a non-PMD aligned and non-PMD sized
1662+ * by the memslot. This means we have to prohibit block size mappings
1663+ * for the beginning and end of a non-block aligned and non-block sized
16411664 * memory slot (illustrated by the head and tail parts of the
16421665 * userspace view above containing pages 'abcde' and 'xyz',
16431666 * respectively).
@@ -1646,8 +1669,8 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
16461669 * userspace_addr or the base_gfn, as both are equally aligned (per
16471670 * the check above) and equally sized.
16481671 */
1649- return (hva & S2_PMD_MASK ) >= uaddr_start &&
1650- (hva & S2_PMD_MASK ) + S2_PMD_SIZE <= uaddr_end ;
1672+ return (hva & ~( map_size - 1 ) ) >= uaddr_start &&
1673+ (hva & ~( map_size - 1 )) + map_size <= uaddr_end ;
16511674}
16521675
16531676static int user_mem_abort (struct kvm_vcpu * vcpu , phys_addr_t fault_ipa ,
@@ -1676,12 +1699,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
16761699 return - EFAULT ;
16771700 }
16781701
1679- if (!fault_supports_stage2_pmd_mappings (memslot , hva ))
1680- force_pte = true;
1681-
1682- if (logging_active )
1683- force_pte = true;
1684-
16851702 /* Let's check if we will get back a huge page backed by hugetlbfs */
16861703 down_read (& current -> mm -> mmap_sem );
16871704 vma = find_vma_intersection (current -> mm , hva , hva + 1 );
@@ -1692,18 +1709,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
16921709 }
16931710
16941711 vma_pagesize = vma_kernel_pagesize (vma );
1712+ if (logging_active ||
1713+ !fault_supports_stage2_huge_mapping (memslot , hva , vma_pagesize )) {
1714+ force_pte = true;
1715+ vma_pagesize = PAGE_SIZE ;
1716+ }
1717+
16951718 /*
16961719 * The stage2 has a minimum of 2 level table (For arm64 see
16971720 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
16981721 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
16991722 * As for PUD huge maps, we must make sure that we have at least
17001723 * 3 levels, i.e, PMD is not folded.
17011724 */
1702- if ((vma_pagesize == PMD_SIZE ||
1703- (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd (kvm ))) &&
1704- !force_pte ) {
1725+ if (vma_pagesize == PMD_SIZE ||
1726+ (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd (kvm )))
17051727 gfn = (fault_ipa & huge_page_mask (hstate_vma (vma ))) >> PAGE_SHIFT ;
1706- }
17071728 up_read (& current -> mm -> mmap_sem );
17081729
17091730 /* We need minimum second+third level pages */
0 commit comments