diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index 1b731821889..08385d3095d 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -324,6 +324,11 @@ initializecpu(void) if ((r[0] & CPUID_HYBRID_CORE_MASK) == CPUID_HYBRID_SMALL_CORE) { PCPU_SET(small_core, 1); + if (pmap_pcid_enabled && + pmap_pcid_invlpg_workaround_uena) { + PCPU_SET(pcid_invlpg_workaround, 1); + pmap_pcid_invlpg_workaround = 1; + } } } } diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index f41e8dafcc8..5c60d301c1e 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -861,7 +861,7 @@ invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1) (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ - invlpg(smp_tlb_addr1); + pmap_invlpg(smp_tlb_pmap, smp_tlb_addr1); if (smp_tlb_pmap == PCPU_GET(curpmap) && smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { @@ -931,10 +931,16 @@ invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1, #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; - do { - invlpg(addr); - addr += PAGE_SIZE; - } while (addr < smp_tlb_addr2); + if (smp_tlb_pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) { + struct invpcid_descr d = { 0 }; + + invpcid(&d, INVPCID_CTXGLOB); + } else { + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < smp_tlb_addr2); + } if (smp_tlb_pmap == PCPU_GET(curpmap) && smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index a44993efb40..07a00963004 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -529,6 +529,12 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); +int pmap_pcid_invlpg_workaround = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, + CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &pmap_pcid_invlpg_workaround, 0, + "Enable small core PCID/INVLPG workaround"); +int pmap_pcid_invlpg_workaround_uena = 1; int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, @@ -2560,6 +2566,9 @@ pmap_init(void) VM_PAGE_TO_PHYS(m); } } + + TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround", + &pmap_pcid_invlpg_workaround_uena); } SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, @@ -2791,7 +2800,7 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ - invlpg(va); + pmap_invlpg(pmap, va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB @@ -3130,7 +3139,7 @@ pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, vm_offset_t addr2 __unused) { if (pmap == kernel_pmap) { - invlpg(va); + pmap_invlpg(kernel_pmap, va); } else if (pmap == PCPU_GET(curpmap)) { invlpg(va); pmap_invalidate_page_cb(pmap, va); @@ -3221,8 +3230,14 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) vm_offset_t addr; if (pmap == kernel_pmap) { - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); + if (PCPU_GET(pcid_invlpg_workaround)) { + struct invpcid_descr d = { 0 }; + + invpcid(&d, INVPCID_CTXGLOB); + } else { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + } } else if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); @@ -3760,7 +3775,7 @@ pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) for (; spa < epa; spa += PAGE_SIZE) { sched_pin(); pte_store(pte, spa | pte_bits); - invlpg(vaddr); + pmap_invlpg(kernel_pmap, vaddr); /* XXXKIB atomic inside flush_cache_range are excessive */ pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); sched_unpin(); @@ -7668,7 +7683,7 @@ pmap_kenter_temporary(vm_paddr_t pa, int i) va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); - invlpg(va); + pmap_invlpg(kernel_pmap, va); return ((void *)crashdumpmap); } @@ -10371,7 +10386,7 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); - invlpg(vaddr[i]); + pmap_invlpg(kernel_pmap, vaddr[i]); } } } @@ -10420,7 +10435,14 @@ pmap_quick_remove_page(vm_offset_t addr) if (addr != qframe) return; pte_store(vtopte(qframe), 0); + + /* + * Since qframe is exclusively mapped by + * pmap_quick_enter_page() and that function doesn't set PG_G, + * we can use INVLPG here. + */ invlpg(qframe); + mtx_unlock_spin(&qframe_mtx); } diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h index 70f008fe835..c0c35f4419e 100644 --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -100,7 +100,8 @@ _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line"); u_int pc_smp_tlb_op; \ uint64_t pc_ucr3_load_mask; \ u_int pc_small_core; \ - char __pad[2912] /* pad to UMA_PCPU_ALLOC_SIZE */ + u_int pc_pcid_invlpg_workaround; \ + char __pad[2908] /* pad to UMA_PCPU_ALLOC_SIZE */ #define PC_DBREG_CMD_NONE 0 #define PC_DBREG_CMD_LOAD 1 diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index a55a14f94ed..e7497c2f8b4 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -431,6 +431,8 @@ extern vm_offset_t virtual_end; extern vm_paddr_t dmaplimit; extern int pmap_pcid_enabled; extern int invpcid_works; +extern int pmap_pcid_invlpg_workaround; +extern int pmap_pcid_invlpg_workaround_uena; #define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) #define pmap_page_is_write_mapped(m) (((m)->a.flags & PGA_WRITEABLE) != 0) @@ -514,6 +516,24 @@ pmap_invalidate_cpu_mask(pmap_t pmap) return (&pmap->pm_active); } +/* + * It seems that AlderLake+ small cores have some microarchitectural + * bug, which results in the INVLPG instruction failing to flush all + * global TLB entries when PCID is enabled. Work around it for now, + * by doing global invalidation on small cores instead of INVLPG. + */ +static __inline void +pmap_invlpg(pmap_t pmap, vm_offset_t va) +{ + if (pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) { + struct invpcid_descr d = { 0 }; + + invpcid(&d, INVPCID_CTXGLOB); + } else { + invlpg(va); + } +} + #endif /* _KERNEL */ /* Return various clipped indexes for a given VA */