diff --git a/share/man/man7/security.7 b/share/man/man7/security.7 index da2bce0159a..39f256eb3fb 100644 --- a/share/man/man7/security.7 +++ b/share/man/man7/security.7 @@ -28,7 +28,7 @@ .\" .\" $FreeBSD$ .\" -.Dd August 13, 2019 +.Dd November 12, 2019 .Dt SECURITY 7 .Os .Sh NAME @@ -1015,6 +1015,13 @@ hardware information leak. .It Dv hw.vmm.vmx.l1d_flush amd64. Controls the mitigation of L1 Terminal Fault in bhyve hypervisor. +.It Dv vm.pmap.allow_2m_x_ept +amd64. +Allows the use of superpages for executable mappings under the EPT +page table format used by hypervisors on Intel CPUs to map the guest +physical address space to machine physical memory. +May be disabled to work around a CPU Erratum called +Machine Check Error Avoidance on Page Size Change. .It Dv kern.elf32.aslr.enable Controls system-global Address Space Layout Randomization (ASLR) for normal non-PIE (Position Independent Executable) 32bit binaries. diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c index ae4c5216d26..841654562d7 100644 --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -1894,6 +1894,51 @@ pmap_page_init(vm_page_t m) m->md.pat_mode = PAT_WRITE_BACK; } +static int pmap_allow_2m_x_ept; +SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, + &pmap_allow_2m_x_ept, 0, + "Allow executable superpage mappings in EPT"); + +void +pmap_allow_2m_x_ept_recalculate(void) +{ + /* + * SKL002, SKL012S. Since the EPT format is only used by + * Intel CPUs, the vendor check is merely a formality. + */ + if (!(cpu_vendor_id != CPU_VENDOR_INTEL || + (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || + (CPUID_TO_FAMILY(cpu_id) == 0x6 && + (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ + CPUID_TO_MODEL(cpu_id) == 0x27 || + CPUID_TO_MODEL(cpu_id) == 0x35 || + CPUID_TO_MODEL(cpu_id) == 0x36 || + CPUID_TO_MODEL(cpu_id) == 0x37 || + CPUID_TO_MODEL(cpu_id) == 0x86 || + CPUID_TO_MODEL(cpu_id) == 0x1c || + CPUID_TO_MODEL(cpu_id) == 0x4a || + CPUID_TO_MODEL(cpu_id) == 0x4c || + CPUID_TO_MODEL(cpu_id) == 0x4d || + CPUID_TO_MODEL(cpu_id) == 0x5a || + CPUID_TO_MODEL(cpu_id) == 0x5c || + CPUID_TO_MODEL(cpu_id) == 0x5d || + CPUID_TO_MODEL(cpu_id) == 0x5f || + CPUID_TO_MODEL(cpu_id) == 0x6e || + CPUID_TO_MODEL(cpu_id) == 0x7a || + CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ + CPUID_TO_MODEL(cpu_id) == 0x85)))) + pmap_allow_2m_x_ept = 1; + TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); +} + +static bool +pmap_allow_2m_x_page(pmap_t pmap, bool executable) +{ + + return (pmap->pm_type != PT_EPT || !executable || + !pmap_allow_2m_x_ept); +} + #ifdef NUMA static void pmap_init_pv_table(void) @@ -2037,6 +2082,9 @@ pmap_init(void) } } + /* IFU */ + pmap_allow_2m_x_ept_recalculate(); + /* * Initialize the vm page array entries for the kernel pmap's * page table pages. @@ -5718,6 +5766,15 @@ retry: } #if VM_NRESERVLEVEL > 0 +static bool +pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) +{ + + if (pmap->pm_type != PT_EPT) + return (false); + return ((pde & EPT_PG_EXECUTE) != 0); +} + /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion @@ -5753,7 +5810,9 @@ pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; - if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { + if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V) || + !pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, + newpde))) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); @@ -6185,6 +6244,12 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, + newpde))) { + CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" + " in pmap %p", va, pmap); + return (KERN_FAILURE); + } if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" @@ -6331,6 +6396,7 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && + pmap_allow_2m_x_page(pmap, (prot & VM_PROT_EXECUTE) != 0) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h index e0abe3d6516..303e2f2553f 100644 --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -422,6 +422,7 @@ struct thread; void pmap_activate_boot(pmap_t pmap); void pmap_activate_sw(struct thread *); +void pmap_allow_2m_x_ept_recalculate(void); void pmap_bootstrap(vm_paddr_t *); int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); int pmap_change_attr(vm_offset_t, vm_size_t, int); diff --git a/sys/dev/cpuctl/cpuctl.c b/sys/dev/cpuctl/cpuctl.c index 7f6295692c0..ea0693717ba 100644 --- a/sys/dev/cpuctl/cpuctl.c +++ b/sys/dev/cpuctl/cpuctl.c @@ -50,6 +50,10 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include +#include + #include #include #include @@ -539,6 +543,7 @@ cpuctl_do_eval_cpu_features(int cpu, struct thread *td) hw_ssb_recalculate(true); #ifdef __amd64__ amd64_syscall_ret_flush_l1d_recalc(); + pmap_allow_2m_x_ept_recalculate(); #endif hw_mds_recalculate(); printcpuinfo(); diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h index 30fbd613ec8..d521701b2be 100644 --- a/sys/x86/include/specialreg.h +++ b/sys/x86/include/specialreg.h @@ -490,6 +490,7 @@ #define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x00000008 #define IA32_ARCH_CAP_SSB_NO 0x00000010 #define IA32_ARCH_CAP_MDS_NO 0x00000020 +#define IA32_ARCH_CAP_IF_PSCHANGE_MC_NO 0x00000040 /* * CPUID manufacturers identifiers