diff --git a/sys/arm/arm/genassym.c b/sys/arm/arm/genassym.c index 19c2ed3d3a2..3301ce156d5 100644 --- a/sys/arm/arm/genassym.c +++ b/sys/arm/arm/genassym.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -58,12 +59,19 @@ __FBSDID("$FreeBSD$"); ASSYM(KERNBASE, KERNBASE); ASSYM(PCB_NOALIGNFLT, PCB_NOALIGNFLT); +#ifdef ARM_NEW_PMAP +ASSYM(CPU_ASID_KERNEL,CPU_ASID_KERNEL); +#endif ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); +#ifndef ARM_NEW_PMAP ASSYM(PCB_DACR, offsetof(struct pcb, pcb_dacr)); +#endif ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_PAGEDIR, offsetof(struct pcb, pcb_pagedir)); +#ifndef ARM_NEW_PMAP ASSYM(PCB_L1VEC, offsetof(struct pcb, pcb_l1vec)); ASSYM(PCB_PL1VEC, offsetof(struct pcb, pcb_pl1vec)); +#endif ASSYM(PCB_R4, offsetof(struct pcb, pcb_regs.sf_r4)); ASSYM(PCB_R5, offsetof(struct pcb, pcb_regs.sf_r5)); ASSYM(PCB_R6, offsetof(struct pcb, pcb_regs.sf_r6)); @@ -131,7 +139,6 @@ ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); #endif ASSYM(PAGE_SIZE, PAGE_SIZE); -ASSYM(PDESIZE, PDESIZE); ASSYM(PMAP_DOMAIN_KERNEL, PMAP_DOMAIN_KERNEL); #ifdef PMAP_INCLUDE_PTE_SYNC ASSYM(PMAP_INCLUDE_PTE_SYNC, 1); @@ -145,8 +152,13 @@ ASSYM(TRAPFRAMESIZE, sizeof(struct trapframe)); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXCPU, MAXCPU); +ASSYM(_NCPUWORDS, _NCPUWORDS); ASSYM(NIRQ, NIRQ); ASSYM(PCPU_SIZE, sizeof(struct pcpu)); +ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); +ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); +ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); +ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(DCACHE_LINE_SIZE, offsetof(struct cpuinfo, dcache_line_size)); ASSYM(DCACHE_LINE_MASK, offsetof(struct cpuinfo, dcache_line_mask)); diff --git a/sys/arm/arm/machdep.c b/sys/arm/arm/machdep.c index 4bd5dc11416..a8f2dd67ff3 100644 --- a/sys/arm/arm/machdep.c +++ b/sys/arm/arm/machdep.c @@ -138,6 +138,14 @@ int _min_bzero_size = 0; extern int *end; #ifdef 
FDT +vm_paddr_t pmap_pa; + +#ifdef ARM_NEW_PMAP +vm_offset_t systempage; +vm_offset_t irqstack; +vm_offset_t undstack; +vm_offset_t abtstack; +#else /* * This is the number of L2 page tables required for covering max * (hypothetical) memsize of 4GB and all kernel mappings (vectors, msgbuf, @@ -147,15 +155,13 @@ extern int *end; static struct pv_addr kernel_pt_table[KERNEL_PT_MAX]; -vm_paddr_t pmap_pa; - struct pv_addr systempage; static struct pv_addr msgbufpv; struct pv_addr irqstack; struct pv_addr undstack; struct pv_addr abtstack; static struct pv_addr kernelstack; - +#endif #endif #if defined(LINUX_BOOT_ABI) @@ -381,9 +387,11 @@ cpu_startup(void *dummy) vm_pager_bufferinit(); pcb->pcb_regs.sf_sp = (u_int)thread0.td_kstack + USPACE_SVC_STACK_TOP; - vector_page_setprot(VM_PROT_READ); pmap_set_pcb_pagedir(pmap_kernel(), pcb); +#ifndef ARM_NEW_PMAP + vector_page_setprot(VM_PROT_READ); pmap_postinit(); +#endif #ifdef ARM_TP_ADDRESS #ifdef ARM_CACHE_LOCK_ENABLE pmap_kenter_user(ARM_TP_ADDRESS, ARM_TP_ADDRESS); @@ -1003,6 +1011,19 @@ init_proc0(vm_offset_t kstack) pcpup->pc_curpcb = thread0.td_pcb; } +#ifdef ARM_NEW_PMAP +void +set_stackptrs(int cpu) +{ + + set_stackptr(PSR_IRQ32_MODE, + irqstack + ((IRQ_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); + set_stackptr(PSR_ABT32_MODE, + abtstack + ((ABT_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); + set_stackptr(PSR_UND32_MODE, + undstack + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); +} +#else void set_stackptrs(int cpu) { @@ -1014,6 +1035,7 @@ set_stackptrs(int cpu) set_stackptr(PSR_UND32_MODE, undstack.pv_va + ((UND_STACK_SIZE * PAGE_SIZE) * (cpu + 1))); } +#endif #ifdef FDT static char * @@ -1048,6 +1070,7 @@ print_kenv(void) debugf(" %x %s\n", (uint32_t)cp, cp); } +#ifndef ARM_NEW_PMAP void * initarm(struct arm_boot_params *abp) { @@ -1316,4 +1339,181 @@ initarm(struct arm_boot_params *abp) return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb))); } +#else /* !ARM_NEW_PMAP */ +void * +initarm(struct 
arm_boot_params *abp) +{ + struct mem_region mem_regions[FDT_MEM_REGIONS]; + vm_paddr_t lastaddr; + vm_offset_t dtbp, kernelstack, dpcpu; + uint32_t memsize; + char *env; + void *kmdp; + int err_devmap, mem_regions_sz; + + /* get last allocated physical address */ + arm_physmem_kernaddr = abp->abp_physaddr; + lastaddr = parse_boot_param(abp) - KERNVIRTADDR + arm_physmem_kernaddr; + + memsize = 0; + set_cpufuncs(); + cpuinfo_init(); + + /* + * Find the dtb passed in by the boot loader. + */ + kmdp = preload_search_by_type("elf kernel"); + if (kmdp != NULL) + dtbp = MD_FETCH(kmdp, MODINFOMD_DTBP, vm_offset_t); + else + dtbp = (vm_offset_t)NULL; +#if defined(FDT_DTB_STATIC) + /* + * In case the device tree blob was not retrieved (from metadata) try + * to use the statically embedded one. + */ + if (dtbp == (vm_offset_t)NULL) + dtbp = (vm_offset_t)&fdt_static_dtb; #endif + + if (OF_install(OFW_FDT, 0) == FALSE) + panic("Cannot install FDT"); + + if (OF_init((void *)dtbp) != 0) + panic("OF_init failed with the found device tree"); + + /* Grab physical memory regions information from device tree. */ + if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, &memsize) != 0) + panic("Cannot get physical memory regions"); + arm_physmem_hardware_regions(mem_regions, mem_regions_sz); + + /* Grab reserved memory regions information from device tree. */ + if (fdt_get_reserved_regions(mem_regions, &mem_regions_sz) == 0) + arm_physmem_exclude_regions(mem_regions, mem_regions_sz, + EXFLAG_NODUMP | EXFLAG_NOALLOC); + + /* + * Set TEX remapping registers. + * Setup kernel page tables and switch to kernel L1 page table. + */ + pmap_set_tex(); + pmap_bootstrap_prepare(lastaddr); + + /* + * Now that proper page tables are installed, call cpu_setup() to enable + * instruction and data caches and other chip-specific features. 
+ */ + cpu_setup(""); + + /* Platform-specific initialisation */ + platform_probe_and_attach(); + pcpu0_init(); + + /* Do basic tuning, hz etc */ + init_param1(); + + /* + * Allocate a page for the system page mapped to 0xffff0000 + * This page will just contain the system vectors and can be + * shared by all processes. + */ + systempage = pmap_preboot_get_pages(1); + + /* Map the vector page. */ + pmap_preboot_map_pages(systempage, ARM_VECTORS_HIGH, 1); + if (virtual_end >= ARM_VECTORS_HIGH) + virtual_end = ARM_VECTORS_HIGH - 1; + + /* Allocate dynamic per-cpu area. */ + dpcpu = pmap_preboot_get_vpages(DPCPU_SIZE / PAGE_SIZE); + dpcpu_init((void *)dpcpu, 0); + + /* Allocate stacks for all modes */ + irqstack = pmap_preboot_get_vpages(IRQ_STACK_SIZE * MAXCPU); + abtstack = pmap_preboot_get_vpages(ABT_STACK_SIZE * MAXCPU); + undstack = pmap_preboot_get_vpages(UND_STACK_SIZE * MAXCPU ); + kernelstack = pmap_preboot_get_vpages(KSTACK_PAGES * MAXCPU); + + /* Allocate message buffer. */ + msgbufp = (void *)pmap_preboot_get_vpages( + round_page(msgbufsize) / PAGE_SIZE); + + /* + * Pages were allocated during the secondary bootstrap for the + * stacks for different CPU modes. + * We must now set the r13 registers in the different CPU modes to + * point to these stacks. + * Since the ARM stacks use STMFD etc. we must set r13 to the top end + * of the stack memory. + */ + set_stackptrs(0); + mutex_init(); + + /* Establish static device mappings. */ + err_devmap = platform_devmap_init(); + arm_devmap_bootstrap(0, NULL); + vm_max_kernel_address = platform_lastaddr(); + + /* + * Only after the SOC registers block is mapped we can perform device + * tree fixups, as they may attempt to read parameters from hardware. 
+ */ + OF_interpret("perform-fixup", 0); + platform_gpio_init(); + cninit(); + + debugf("initarm: console initialized\n"); + debugf(" arg1 kmdp = 0x%08x\n", (uint32_t)kmdp); + debugf(" boothowto = 0x%08x\n", boothowto); + debugf(" dtbp = 0x%08x\n", (uint32_t)dtbp); + debugf(" lastaddr1: 0x%08x\n", lastaddr); + print_kenv(); + + env = kern_getenv("kernelname"); + if (env != NULL) + strlcpy(kernelname, env, sizeof(kernelname)); + + if (err_devmap != 0) + printf("WARNING: could not fully configure devmap, error=%d\n", + err_devmap); + + platform_late_init(); + + /* + * We must now clean the cache again.... + * Cleaning may be done by reading new data to displace any + * dirty data in the cache. This will have happened in setttb() + * but since we are boot strapping the addresses used for the read + * may have just been remapped and thus the cache could be out + * of sync. A re-clean after the switch will cure this. + * After booting there are no gross relocations of the kernel thus + * this problem will not occur after initarm(). + */ + /* Set stack for exception handlers */ + undefined_init(); + init_proc0(kernelstack); + arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL); + enable_interrupts(PSR_A); + pmap_bootstrap(0); + + /* Exclude the kernel (and all the things we allocated which immediately + * follow the kernel) from the VM allocation pool but not from crash + * dumps. virtual_avail is a global variable which tracks the kva we've + * "allocated" while setting up pmaps. + * + * Prepare the list of physical memory available to the vm subsystem. + */ + arm_physmem_exclude_region(abp->abp_physaddr, + pmap_preboot_get_pages(0) - abp->abp_physaddr, EXFLAG_NOALLOC); + arm_physmem_init_kernel_globals(); + + init_param2(physmem); + /* Init message buffer. 
*/ + msgbufinit(msgbufp, msgbufsize); + kdb_init(); + return ((void *)STACKALIGN(thread0.td_pcb)); + +} + +#endif /* !ARM_NEW_PMAP */ +#endif /* FDT */ diff --git a/sys/arm/arm/mem.c b/sys/arm/arm/mem.c index 58b0d25a52d..2e4128b5519 100644 --- a/sys/arm/arm/mem.c +++ b/sys/arm/arm/mem.c @@ -113,6 +113,9 @@ memrw(struct cdev *dev, struct uio *uio, int flags) return (EINVAL); sx_xlock(&tmppt_lock); pmap_kenter((vm_offset_t)_tmppt, v); +#ifdef ARM_NEW_PMAP + pmap_tlb_flush(kernel_pmap, (vm_offset_t)_tmppt); +#endif o = (int)uio->uio_offset & PAGE_MASK; c = (u_int)(PAGE_SIZE - ((int)iov->iov_base & PAGE_MASK)); c = min(c, (u_int)(PAGE_SIZE - o)); diff --git a/sys/arm/arm/minidump_machdep.c b/sys/arm/arm/minidump_machdep.c index c12aefa8b50..71e732ed929 100644 --- a/sys/arm/arm/minidump_machdep.c +++ b/sys/arm/arm/minidump_machdep.c @@ -61,7 +61,10 @@ CTASSERT(sizeof(struct kerneldumpheader) == 512); uint32_t *vm_page_dump; int vm_page_dump_size; +#ifndef ARM_NEW_PMAP + static struct kerneldumpheader kdh; + static off_t dumplo; /* Handle chunked writes. */ @@ -473,8 +476,20 @@ fail: else printf("\n** DUMP FAILED (ERROR %d) **\n", error); return (error); + return (0); } +#else /* ARM_NEW_PMAP */ + +int +minidumpsys(struct dumperinfo *di) +{ + + return (0); +} + +#endif + void dump_add_page(vm_paddr_t pa) { diff --git a/sys/arm/arm/mp_machdep.c b/sys/arm/arm/mp_machdep.c index 6f278bbd4ef..fff34c41a69 100644 --- a/sys/arm/arm/mp_machdep.c +++ b/sys/arm/arm/mp_machdep.c @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -151,10 +152,20 @@ init_secondary(int cpu) uint32_t loop_counter; int start = 0, end = 0; +#ifdef ARM_NEW_PMAP + pmap_set_tex(); + reinit_mmu(pmap_kern_ttb, (1<<6) | (1<< 0), (1<<6) | (1<< 0)); + cpu_setup(""); + + /* Provide stack pointers for other processor modes. 
*/ + set_stackptrs(cpu); + + enable_interrupts(PSR_A); +#else /* ARM_NEW_PMAP */ cpu_setup(NULL); setttb(pmap_pa); cpu_tlb_flushID(); - +#endif /* ARM_NEW_PMAP */ pc = &__pcpu[cpu]; /* @@ -166,10 +177,10 @@ init_secondary(int cpu) pcpu_init(pc, cpu, sizeof(struct pcpu)); dpcpu_init(dpcpu[cpu - 1], cpu); - +#ifndef ARM_NEW_PMAP /* Provide stack pointers for other processor modes. */ set_stackptrs(cpu); - +#endif /* Signal our startup to BSP */ atomic_add_rel_32(&mp_naps, 1); @@ -298,6 +309,12 @@ ipi_handler(void *arg) CTR1(KTR_SMP, "%s: IPI_TLB", __func__); cpufuncs.cf_tlb_flushID(); break; +#ifdef ARM_NEW_PMAP + case IPI_LAZYPMAP: + CTR1(KTR_SMP, "%s: IPI_LAZYPMAP", __func__); + pmap_lazyfix_action(); + break; +#endif default: panic("Unknown IPI 0x%0x on cpu %d", ipi, curcpu); } diff --git a/sys/arm/arm/pmap-v6-new.c b/sys/arm/arm/pmap-v6-new.c new file mode 100644 index 00000000000..8c159181aae --- /dev/null +++ b/sys/arm/arm/pmap-v6-new.c @@ -0,0 +1,6723 @@ +/*- + * Copyright (c) 1991 Regents of the University of California. + * Copyright (c) 1994 John S. Dyson + * Copyright (c) 1994 David Greenman + * Copyright (c) 2005-2010 Alan L. Cox + * Copyright (c) 2014 Svatopluk Kraus + * Copyright (c) 2014 Michal Meloun + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 + */ +/*- + * Copyright (c) 2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jake Burkholder, + * Safeport Network Services, and Network Associates Laboratories, the + * Security Research Division of Network Associates, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA + * CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * Manages physical address maps. + * + * Since the information managed by this module is + * also stored by the logical address mapping module, + * this module may throw away valid virtual-to-physical + * mappings at almost any time. However, invalidations + * of virtual-to-physical mappings must be done as + * requested. + * + * In order to cope with hardware architectures which + * make virtual-to-physical map invalidates expensive, + * this module may delay invalidate or reduced protection + * operations until such time as they are actually + * necessary. This module is given full information as + * to which processors are currently using which maps, + * and to when physical maps must be made correct. 
+ */ + +#include "opt_vm.h" +#include "opt_pmap.h" +#include "opt_ddb.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef SMP +#include +#else +#include +#endif + +#ifdef DDB +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef SMP +#include +#endif + +#ifndef PMAP_SHPGPERPROC +#define PMAP_SHPGPERPROC 200 +#endif + +#ifndef DIAGNOSTIC +#define PMAP_INLINE __inline +#else +#define PMAP_INLINE +#endif + +#ifdef PMAP_DEBUG +static void pmap_zero_page_check(vm_page_t m); +void pmap_debug(int level); +int pmap_pid_dump(int pid); +void pmap_pvdump(vm_paddr_t pa); + +#define PDEBUG(_lev_,_stat_) \ + if (pmap_debug_level >= (_lev_)) \ + ((_stat_)) +#define dprintf printf +int pmap_debug_level = 1; +#else /* PMAP_DEBUG */ +#define PDEBUG(_lev_,_stat_) /* Nothing */ +#define dprintf(x, arg...) +#endif /* PMAP_DEBUG */ + +/* + * Level 2 page tables map definion ('max' is excluded). + */ + +#define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) +#define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) + +#define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) +#define UPT2V_MAX_ADDRESS \ + ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) + +/* + * Promotion to a 1MB (PTE1) page mapping requires that the corresponding + * 4KB (PTE2) page mappings have identical settings for the following fields: + */ +#define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ + PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ + PTE2_ATTR_MASK) + +#define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ + PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ + PTE1_ATTR_MASK) + +#define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ + (((l2_attr) & L2_C) ? 
L1_S_C : 0) | \ + (((l2_attr) & L2_B) ? L1_S_B : 0) | \ + (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ + (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ + (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ + (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ + (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ + (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ + (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ + (((l2_attr) & PTE2_W) ? PTE1_W : 0)) + +#define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ + (((l1_attr) & L1_S_C) ? L2_C : 0) | \ + (((l1_attr) & L1_S_B) ? L2_B : 0) | \ + (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ + (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ + (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ + (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ + (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ + (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ + (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ + (((l1_attr) & PTE1_W) ? PTE2_W : 0)) + +/* + * PTE2 descriptors creation macros. + */ +#define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, pt_memattr) +#define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, pt_memattr) + +#define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_NORMAL) +#define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_NORMAL) + +#define PV_STATS +#ifdef PV_STATS +#define PV_STAT(x) do { x ; } while (0) +#else +#define PV_STAT(x) do { } while (0) +#endif + +/* + * The boot_pt1 is used temporary in very early boot stage as L1 page table. + * We can init many things with no memory allocation thanks to its static + * allocation and this brings two main advantages: + * (1) other cores can be started very simply, + * (2) various boot loaders can be supported as its arguments can be processed + * in virtual address space and can be moved to safe location before + * first allocation happened. + * Only disadvantage is that boot_pt1 is used only in very early boot stage. + * However, the table is uninitialized and so lays in bss. Therefore kernel + * image size is not influenced. 
+ * + * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and + * CPU suspend/resume game. + */ +extern pt1_entry_t boot_pt1[]; + +vm_paddr_t base_pt1; +pt1_entry_t *kern_pt1; +pt2_entry_t *kern_pt2tab; +pt2_entry_t *PT2MAP; + +static uint32_t ttb_flags; +static vm_memattr_t pt_memattr; +ttb_entry_t pmap_kern_ttb; + +/* XXX use converion function*/ +#define PTE2_ATTR_NORMAL VM_MEMATTR_DEFAULT +#define PTE1_ATTR_NORMAL ATTR_TO_L1(PTE2_ATTR_NORMAL) + +struct pmap kernel_pmap_store; +LIST_HEAD(pmaplist, pmap); +static struct pmaplist allpmaps; +static struct mtx allpmaps_lock; + +vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ +vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ + +static vm_offset_t kernel_vm_end_new; +vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; +vm_offset_t vm_max_kernel_address; +vm_paddr_t kernel_l1pa; + +static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; + +/* + * Data for the pv entry allocation mechanism + */ +static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); +static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; +static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? 
*/ +static int shpgperproc = PMAP_SHPGPERPROC; + +struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ +int pv_maxchunks; /* How many chunks we have KVA for */ +vm_offset_t pv_vafree; /* freelist stored in the PTE */ + +vm_paddr_t first_managed_pa; +#define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) + +/* + * All those kernel PT submaps that BSD is so fond of + */ +struct sysmaps { + struct mtx lock; + pt2_entry_t *CMAP1; + pt2_entry_t *CMAP2; + pt2_entry_t *CMAP3; + caddr_t CADDR1; + caddr_t CADDR2; + caddr_t CADDR3; +}; +static struct sysmaps sysmaps_pcpu[MAXCPU]; +static pt2_entry_t *CMAP3; +static caddr_t CADDR3; +caddr_t _tmppt = 0; + +struct msgbuf *msgbufp = 0; /* XXX move it to machdep.c */ + +/* + * Crashdump maps. + */ +static caddr_t crashdumpmap; + +static pt2_entry_t *PMAP1 = 0, *PMAP2; +static pt2_entry_t *PADDR1 = 0, *PADDR2; +#ifdef DDB +static pt2_entry_t *PMAP3; +static pt2_entry_t *PADDR3; +static int PMAP3cpu __unused; /* for SMP only */ +#endif +#ifdef SMP +static int PMAP1cpu; +static int PMAP1changedcpu; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, + &PMAP1changedcpu, 0, + "Number of times pmap_pte2_quick changed CPU with same PMAP1"); +#endif +static int PMAP1changed; +SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, + &PMAP1changed, 0, + "Number of times pmap_pte2_quick changed PMAP1"); +static int PMAP1unchanged; +SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, + &PMAP1unchanged, 0, + "Number of times pmap_pte2_quick didn't change PMAP1"); +static struct mtx PMAP2mutex; + +static __inline void pt2_wirecount_init(vm_page_t m); +static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, + vm_offset_t va); +void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); + +/* + * Function to set the debug level of the pmap code. 
+ */
+#ifdef PMAP_DEBUG
+void
+pmap_debug(int level)
+{
+
+	pmap_debug_level = level;
+	dprintf("pmap_debug: level=%d\n", pmap_debug_level);
+}
+#endif /* PMAP_DEBUG */
+
+/*
+ * This table must correspond with memory attribute configuration in vm.h.
+ * First entry is used for normal system mapping.
+ *
+ * Device memory is always marked as shared.
+ * Normal memory is shared only in SMP.
+ * Not-outer-shareable (NOS) bits are not used yet (always passed as 0).
+ * Class 6 cannot be used on ARM11.
+ */
+#define TEXDEF_TYPE_SHIFT	0
+#define TEXDEF_TYPE_MASK	0x3
+#define TEXDEF_INNER_SHIFT	2
+#define TEXDEF_INNER_MASK	0x3
+#define TEXDEF_OUTER_SHIFT	4
+#define TEXDEF_OUTER_MASK	0x3
+#define TEXDEF_NOS_SHIFT	6
+#define TEXDEF_NOS_MASK	0x1
+
+#define TEX(t, i, o, s)				\
+	(((t) << TEXDEF_TYPE_SHIFT) |		\
+	 ((i) << TEXDEF_INNER_SHIFT) |		\
+	 ((o) << TEXDEF_OUTER_SHIFT) |		\
+	 ((s) << TEXDEF_NOS_SHIFT))
+
+static uint32_t tex_class[8] = {
+/*	    type      inner cache outer cache */
+	TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0),	/* 0 - ATTR_WB_WA */
+	TEX(PRRR_MEM, NMRR_NC,	  NMRR_NC,    0),	/* 1 - ATTR_NOCACHE */
+	TEX(PRRR_DEV, NMRR_NC,	  NMRR_NC,    0),	/* 2 - ATTR_DEVICE */
+	TEX(PRRR_SO,  NMRR_NC,	  NMRR_NC,    0),	/* 3 - ATTR_SO */
+	TEX(PRRR_MEM, NMRR_NC,	  NMRR_NC,    0),	/* 4 - NOT USED YET */
+	TEX(PRRR_MEM, NMRR_NC,	  NMRR_NC,    0),	/* 5 - NOT USED YET */
+	TEX(PRRR_MEM, NMRR_NC,	  NMRR_NC,    0),	/* 6 - NOT USED YET */
+	TEX(PRRR_MEM, NMRR_NC,	  NMRR_NC,    0),	/* 7 - NOT USED YET */
+};
+#undef TEX
+
+/*
+ * Convert TEX definition entry to TTB flags.
+ */
+static uint32_t
+encode_ttb_flags(int idx)
+{
+	uint32_t texdef, in_pol, out_pol, nos_bit, flags;
+
+	/* Unpack the cache policy fields of the selected TEX class. */
+	texdef = tex_class[idx];
+	in_pol = (texdef >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK;
+	out_pol = (texdef >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK;
+	nos_bit = (texdef >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK;
+
+	/*
+	 * NOS goes to bit 5, the outer policy to bits [4:3] and the high
+	 * bit of the inner policy to bit 0.
+	 * NOTE(review): presumably the TTBR NOS/RGN/IRGN fields - confirm
+	 * against the architecture manual.
+	 */
+	flags = (nos_bit << 5) | (out_pol << 3) | ((in_pol & 0x2) >> 1);
+
+	/* The low inner policy bit lands in bit 6 only on coherent walk. */
+	if (cpuinfo.coherent_walk)
+		flags |= (in_pol & 0x1) << 6;
+#ifdef SMP
+	/* Bit 1 is always set on SMP kernels. */
+	flags |= 1 << 1;
+#endif
+	return (flags);
+}
+
+/*
+ * Set TEX remapping registers in current CPU.
+ *
+ * Picks the memory attribute and TTB flags used for page tables, packs
+ * the eight tex_class[] entries into the PRRR and NMRR values, writes
+ * both registers and finally flushes the local TLB.
+ */
+void
+pmap_set_tex(void)
+{
+	uint32_t prrr_val, nmrr_val, texdef;
+	uint32_t mem_type, in_pol, out_pol, nos_bit;
+	int cls;
+
+#ifdef PMAP_PTE_NOCACHE
+	/* XXX fixme */
+	if (!cpuinfo.coherent_walk) {
+		pt_memattr = VM_MEMATTR_NOCACHE;
+		ttb_flags = encode_ttb_flags(1);
+	} else {
+		pt_memattr = VM_MEMATTR_WB_WA;
+		ttb_flags = encode_ttb_flags(0);
+	}
+#else
+	pt_memattr = VM_MEMATTR_WB_WA;
+	ttb_flags = encode_ttb_flags(0);
+#endif
+
+	/* Shareable bits for device memory are always present. */
+	prrr_val = PRRR_DS0 | PRRR_DS1;
+#ifdef SMP
+	/* Shareable bits for normal memory are added in SMP case only. */
+	prrr_val |= PRRR_NS1;
+#endif
+	nmrr_val = 0;
+
+	/* Merge every TEX class into the remapping register values. */
+	for (cls = 0; cls < 8; cls++) {
+		texdef = tex_class[cls];
+		mem_type = (texdef >> TEXDEF_TYPE_SHIFT) & TEXDEF_TYPE_MASK;
+		in_pol = (texdef >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK;
+		out_pol = (texdef >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK;
+		nos_bit = (texdef >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK;
+
+		prrr_val |= (mem_type << (cls * 2)) | (nos_bit << (cls + 24));
+		nmrr_val |= (in_pol << (cls * 2)) |
+		    (out_pol << (cls * 2 + 16));
+	}
+	cp15_prrr_set(prrr_val);
+	cp15_nmrr_set(nmrr_val);
+
+	/* Caches are disabled, so full TLB flush should be enough. */
+	tlb_flush_all_local();
+}
+
+/*
+ * KERNBASE must be multiple of NPT2_IN_PG * PTE1_SIZE. In other words,
+ * KERNBASE is mapped by first L2 page table in L2 page table page. It
+ * meets same constraint due to PT2MAP being placed just under KERNBASE.
+ */
+CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0);
+CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE);
+
+/*
+ * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general.
+ * For now, anyhow, the following check must be fulfilled.
+ */
+CTASSERT(PAGE_SIZE == PTE2_SIZE);
+/*
+ * We don't want to mess up MI code with all MMU and PMAP definitions,
+ * so some things, which depend on other ones, are defined independently.
+ * Now, it is time to check that we didn't screw something up.
+ */
+CTASSERT(PDRSHIFT == PTE1_SHIFT);
+/*
+ * Check consistency of L1 and L2 page table entry definitions.
+ */
+CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1));
+CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2));
+/*
+ * Check consistency of a page of L2 page tables.
+ */
+CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2));
+CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG);
+/*
+ * Check PT2TAB consistency.
+ * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG.
+ * This division must be exact (no remainder).
+ */
+CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG));
+
+/*
+ * A PT2MAP magic.
+ *
+ * All level 2 page tables (PT2s) are mapped continuously and accordingly
+ * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can
+ * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page
+ * must be used together, but not necessarily at once. The first PT2 in a
+ * page must map things on a correctly aligned address and the others must
+ * follow in the right order.
+ */
+#define NB_IN_PT2TAB	(PT2TAB_ENTRIES * sizeof(pt2_entry_t))
+#define NPT2_IN_PT2TAB	(NB_IN_PT2TAB / NB_IN_PT2)
+#define NPG_IN_PT2TAB	(NB_IN_PT2TAB / PAGE_SIZE)
+
+/*
+ * Check PT2TAB consistency.
+ * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2.
+ * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE.
+ * Both divisions must be exact (no remainder).
+ */
+CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2));
+CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE));
+/*
+ * The implementation was made general, however, with the assumption
+ * below in mind. In case of another value of NPG_IN_PT2TAB,
+ * the code should be rechecked once more.
+ */
+CTASSERT(NPG_IN_PT2TAB == 1);
+
+/*
+ * Return the byte offset, inside a page of PT2s, of the PT2
+ * that belongs to the given PT1 index.
+ */
+static __inline u_int
+page_pt2off(u_int pt1_idx)
+{
+	u_int slot;
+
+	/* Position of the PT2 among the PT2s sharing one page. */
+	slot = pt1_idx & PT2PG_MASK;
+	return (slot * NB_IN_PT2);
+}
+
+/*
+ * Return the physical address of the PT2 that belongs to the given
+ * PT1 index, given the physical address of its page of PT2s.
+ */
+static __inline vm_paddr_t
+page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx)
+{
+	u_int off;
+
+	off = page_pt2off(pt1_idx);
+	return (pgpa + off);
+}
+
+/*
+ * Return a pointer to the first entry of the PT2 that belongs to the
+ * given PT1 index, given the virtual address of its page of PT2s.
+ */
+static __inline pt2_entry_t *
+page_pt2(vm_offset_t pgva, u_int pt1_idx)
+{
+	vm_offset_t pt2va;
+
+	pt2va = pgva + page_pt2off(pt1_idx);
+	return ((pt2_entry_t *)pt2va);
+}
+
+/*
+ * Get virtual address of PT2s page (mapped in PT2MAP)
+ * which holds PT2 which holds entry which maps given virtual address.
+ */
+static __inline vm_offset_t
+pt2map_pt2pg(vm_offset_t va)
+{
+	vm_offset_t base;
+
+	/* Round down to the start of the range covered by one PT2s page. */
+	base = va & ~(NPT2_IN_PG * PTE1_SIZE - 1);
+	return ((vm_offset_t)pt2map_entry(base));
+}
+
+/*****************************************************************************
+ *
+ *     THREE pmap initialization milestones exist:
+ *
+ *  locore.S
+ *    -> fundamental init (including MMU) in ASM
+ *
+ *  initarm()
+ *    -> fundamental init continues in C
+ *    -> first available physical address is known
+ *
+ *    pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins)
+ *      -> basic (safe) interface for physical address allocation is made
+ *      -> basic (safe) interface for virtual mapping is made
+ *      -> limited, not SMP coherent, work is possible
+ *
+ *    -> more fundamental init continues in C
+ *    -> locks and some more things are available
+ *    -> all fundamental allocations and mappings are done
+ *
+ *    pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins)
+ *      -> phys_avail[] and virtual_avail are set
+ *      -> control is passed to vm subsystem
+ *      -> physical and virtual address allocation are off limits
+ *      -> low level mapping functions, some SMP coherent,
+ *         are available, which cannot be used before vm subsystem
+ *         is being inited
+ *
+ *  mi_startup()
+ *    -> vm subsystem is being inited
+ *
+ *      pmap_init() -> THIRD PMAP MILESTONE (third epoch begins)
+ *        -> pmap is fully inited
+ *
+ *****************************************************************************/
+
+/*****************************************************************************
+ *
+ *	PMAP first stage initialization and utility functions
+ *	for pre-bootstrap epoch.
+ *
+ *  After pmap_bootstrap_prepare() is called, the following functions
+ *  can be used:
+ *
+ *  (1) strictly only for this stage functions for physical page allocations,
+ *      virtual space allocations, and mappings:
+ *
+ *  vm_paddr_t pmap_preboot_get_pages(u_int num);
+ *  void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num);
+ *  vm_offset_t pmap_preboot_reserve_pages(u_int num);
+ *  vm_offset_t pmap_preboot_get_vpages(u_int num);
+ *  void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size,
+ *      int prot, int attr);
+ *
+ *  (2) for all stages:
+ *
+ *  vm_paddr_t pmap_kextract(vm_offset_t va);
+ *
+ *  NOTE: This is not SMP coherent stage.
+ *
+ *****************************************************************************/
+
+#define KERNEL_P2V(pa) \
+    ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR))
+#define KERNEL_V2P(va) \
+    ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr))
+
+/* Next physical address handed out by pmap_preboot_get_pages(). */
+static vm_paddr_t last_paddr;
+
+/*
+ * Pre-bootstrap epoch page allocator.
+ *
+ * Hands out 'num' physically contiguous pages by advancing last_paddr and
+ * returns the physical address of the first one. A 'num' of zero allocates
+ * nothing and just reports the current value of last_paddr.
+ */
+vm_paddr_t
+pmap_preboot_get_pages(u_int num)
+{
+	vm_paddr_t first;
+
+	first = last_paddr;
+	last_paddr = first + num * PAGE_SIZE;
+	return (first);
+}
+
+/*
+ *  The fundamental initialization of PMAP stuff.
+ *
+ *  Some things already happened in locore.S and some things could happen
+ *  before pmap_bootstrap_prepare() is called, so let's recall what is done:
+ *  1. Caches are disabled.
+ *  2. We are running on virtual addresses already with 'boot_pt1'
+ *     as L1 page table.
+ *  3. So far, all virtual addresses can be converted to physical ones and
+ *     vice versa by the following macros:
+ *       KERNEL_P2V(pa) .... physical to virtual ones,
+ *       KERNEL_V2P(va) .... virtual to physical ones.
+ *
+ *  What is done herein:
+ *  1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'.
+ *  2. PT2MAP magic is brought to life.
+ *  3. Basic preboot functions for page allocations and mappings can be used.
+ *  4. Everything is prepared for L1 cache enabling.
+ * + * Variations: + * 1. To use second TTB register, so kernel and users page tables will be + * separated. This way process forking - pmap_pinit() - could be faster, + * it saves physical pages and KVA per a process, and it's simple change. + * However, it will lead, due to hardware matter, to the following: + * (a) 2G space for kernel and 2G space for users. + * (b) 1G space for kernel in low addresses and 3G for users above it. + * A question is: Is the case (b) really an option? Note that case (b) + * does save neither physical memory and KVA. + */ +void +pmap_bootstrap_prepare(vm_paddr_t last) +{ + vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; + vm_offset_t pt2pg_va; + pt1_entry_t *pte1p; + pt2_entry_t *pte2p; + u_int i; + + /* + * Now, we are going to make real kernel mapping. Note that we are + * already running on some mapping made in locore.S and we expect + * that it's large enough to ensure nofault access to physical memory + * allocated herein before switch. + * + * As kernel image and everything needed before are and will be mapped + * by section mappings, we align last physical address to PTE1_SIZE. + */ + last_paddr = pte1_roundup(last); + + /* + * Allocate and zero page(s) for kernel L1 page table. + * + * Note that it's first allocation on space which was PTE1_SIZE + * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. + */ + base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); + kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); + bzero((void*)kern_pt1, NB_IN_PT1); + pte1_sync_range(kern_pt1, NB_IN_PT1); + + /* Allocate and zero page(s) for kernel PT2TAB. */ + pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); + kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); + bzero(kern_pt2tab, NB_IN_PT2TAB); + pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); + + /* Allocate and zero page(s) for kernel L2 page tables. 
*/ + pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); + pt2pg_va = KERNEL_P2V(pt2pg_pa); + size = NKPT2PG * PAGE_SIZE; + bzero((void*)pt2pg_va, size); + pte2_sync_range((pt2_entry_t *)pt2pg_va, size); + + /* + * Add a physical memory segment (vm_phys_seg) corresponding to the + * preallocated pages for kernel L2 page tables so that vm_page + * structures representing these pages will be created. The vm_page + * structures are required for promotion of the corresponding kernel + * virtual addresses to section mappings. + */ + vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); + + /* + * Insert allocated L2 page table pages to PT2TAB and make + * link to all PT2s in L1 page table. See how kernel_vm_end + * is initialized. + * + * We play simple and safe. So every KVA will have underlaying + * L2 page table, even kernel image mapped by sections. + */ + pte2p = kern_pt2tab_entry(KERNBASE); + for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) + pt2tab_store(pte2p++, PTE2_KPT(pa)); + + pte1p = kern_pte1(KERNBASE); + for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) + pte1_store(pte1p++, PTE1_LINK(pa)); + + /* Make section mappings for kernel. */ + pte1p = kern_pte1(KERNBASE); + for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) + pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, + ATTR_TO_L1(PTE2_ATTR_WB_WA))); + + /* + * Get free and aligned space for PT2MAP and make L1 page table links + * to L2 page tables held in PT2TAB. + * + * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t + * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus + * each entry in PT2TAB maps all PT2s in a page. This implies that + * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. 
+ */ + PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); + pte1p = kern_pte1((vm_offset_t)PT2MAP); + for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { + pte1_store(pte1p++, PTE1_LINK(pa)); + } + + /* + * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. + * Each pmap will hold own PT2TAB, so the mapping should be not global. + */ + pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); + for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { + pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); + } + + /* + * Choose correct L2 page table and make mappings for allocations + * made herein which replaces temporary locore.S mappings after a while. + * Note that PT2MAP cannot be used until we switch to kern_pt1. + * + * Note, that these allocations started aligned on 1M section and + * kernel PT1 was allocated first. Making of mappings must follow + * order of physical allocations as we've used KERNEL_P2V() macro + * for virtual addresses resolution. + */ + pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); + pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); + + pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); + + /* Make mapping for kernel L1 page table. */ + for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) + pte2_store(pte2p++, PTE2_KPT(pa)); + + /* Make mapping for kernel PT2TAB. */ + for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) + pte2_store(pte2p++, PTE2_KPT(pa)); + + /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ + pmap_kern_ttb = base_pt1 | ttb_flags; + reinit_mmu(pmap_kern_ttb, (1 << 6) | (1 << 0), (1 << 6) | (1 << 0)); + + /* + * Initialize the first available KVA. As kernel image is mapped by + * sections, we are leaving some gap behind. + */ + virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; +} + +/* + * Setup L2 page table page for given KVA. + * Used in pre-bootstrap epoch. 
+ * + * Note that we have allocated NKPT2PG pages for L2 page tables in advance + * and used them for mapping KVA starting from KERNBASE. However, this is not + * enough. Vectors and devices need L2 page tables too. Note that they are + * even above VM_MAX_KERNEL_ADDRESS. + */ +static __inline vm_paddr_t +pmap_preboot_pt2pg_setup(vm_offset_t va) +{ + pt2_entry_t *pte2p, pte2; + vm_paddr_t pt2pg_pa; + + /* Get associated entry in PT2TAB. */ + pte2p = kern_pt2tab_entry(va); + + /* Just return, if PT2s page exists already. */ + pte2 = pt2tab_load(pte2p); + if (pte2_is_valid(pte2)) + return (pte2_pa(pte2)); + + KASSERT(va >= VM_MAX_KERNEL_ADDRESS, + ("%s: NKPT2PG too small", __func__)); + + /* + * Allocate page for PT2s and insert it to PT2TAB. + * In other words, map it into PT2MAP space. + */ + pt2pg_pa = pmap_preboot_get_pages(1); + pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); + + /* Zero all PT2s in allocated page. */ + bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); + pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); + + return (pt2pg_pa); +} + +/* + * Setup L2 page table for given KVA. + * Used in pre-bootstrap epoch. + */ +static void +pmap_preboot_pt2_setup(vm_offset_t va) +{ + pt1_entry_t *pte1p; + vm_paddr_t pt2pg_pa, pt2_pa; + + /* Setup PT2's page. */ + pt2pg_pa = pmap_preboot_pt2pg_setup(va); + pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); + + /* Insert PT2 to PT1. */ + pte1p = kern_pte1(va); + pte1_store(pte1p, PTE1_LINK(pt2_pa)); +} + +/* + * Get L2 page entry associated with given KVA. + * Used in pre-bootstrap epoch. + */ +static __inline pt2_entry_t* +pmap_preboot_vtopte2(vm_offset_t va) +{ + pt1_entry_t *pte1p; + + /* Setup PT2 if needed. */ + pte1p = kern_pte1(va); + if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ + pmap_preboot_pt2_setup(va); + + return (pt2map_entry(va)); +} + +/* + * Pre-bootstrap epoch page(s) mapping(s). 
+ */ +void +pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) +{ + u_int i; + pt2_entry_t *pte2p; + + /* Map all the pages. */ + for (i = 0; i < num; i++) { + pte2p = pmap_preboot_vtopte2(va); + pte2_store(pte2p, PTE2_KRW(pa)); + va += PAGE_SIZE; + pa += PAGE_SIZE; + } +} + +/* + * Pre-bootstrap epoch virtual space alocator. + */ +vm_offset_t +pmap_preboot_reserve_pages(u_int num) +{ + u_int i; + vm_offset_t start, va; + pt2_entry_t *pte2p; + + /* Allocate virtual space. */ + start = va = virtual_avail; + virtual_avail += num * PAGE_SIZE; + + /* Zero the mapping. */ + for (i = 0; i < num; i++) { + pte2p = pmap_preboot_vtopte2(va); + pte2_store(pte2p, 0); + va += PAGE_SIZE; + } + + return (start); +} + +/* + * Pre-bootstrap epoch page(s) allocation and mapping(s). + */ +vm_offset_t +pmap_preboot_get_vpages(u_int num) +{ + vm_paddr_t pa; + vm_offset_t va; + + /* Allocate physical page(s). */ + pa = pmap_preboot_get_pages(num); + + /* Allocate virtual space. */ + va = virtual_avail; + virtual_avail += num * PAGE_SIZE; + + /* Map and zero all. */ + pmap_preboot_map_pages(pa, va, num); + bzero((void *)va, num * PAGE_SIZE); + + return (va); +} + +/* + * Pre-bootstrap epoch page mapping(s) with attributes. + */ +void +pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, int prot, + int attr) +{ + u_int num; + u_int l1_attr, l1_prot; + pt1_entry_t *pte1p; + pt2_entry_t *pte2p; + + l1_prot = ATTR_TO_L1(prot); + l1_attr = ATTR_TO_L1(attr); + + /* Map all the pages. 
*/ + num = round_page(size); + while (num > 0) { + if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { + pte1p = kern_pte1(va); + pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); + va += PTE1_SIZE; + pa += PTE1_SIZE; + num -= PTE1_SIZE; + } else { + pte2p = pmap_preboot_vtopte2(va); + pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); + va += PAGE_SIZE; + pa += PAGE_SIZE; + num -= PAGE_SIZE; + } + } + +} + +/* + * Extract from the kernel page table the physical address + * that is mapped by the given virtual address "va". + */ +vm_paddr_t +pmap_kextract(vm_offset_t va) +{ + vm_paddr_t pa; + pt1_entry_t pte1; + pt2_entry_t pte2; + + pte1 = pte1_load(kern_pte1(va)); + if (pte1_is_section(pte1)) { + pa = pte1_pa(pte1) | (va & PTE1_OFFSET); + } else if (pte1_is_link(pte1)) { + /* + * We should beware of concurrent promotion that changes + * pte1 at this point. However, it's not a problem as PT2 + * page is preserved by promotion in PT2TAB. So even if + * it happens, using of PT2MAP is still safe. + * + * QQQ: However, concurrent removing is a problem which + * ends in abort on PT2MAP space. Locking must be used + * to deal with this. + */ + pte2 = pte2_load(pt2map_entry(va)); + pa = pte2_pa(pte2) | (va & PTE2_OFFSET); + } + else { + panic("%s: va %#x pte1 %#x", __func__, va, pte1); + } + return (pa); +} + +/***************************************************************************** + * + * PMAP second stage initialization and utility functions + * for bootstrap epoch. + * + * After pmap_bootstrap() is called, the following functions for + * mappings can be used: + * + * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); + * void pmap_kremove(vm_offset_t va); + * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, + * int prot); + * + * NOTE: This is not SMP coherent stage. And physical page allocation is not + * allowed during this stage. 
+ * + *****************************************************************************/ + +/* + * Initialize kernel PMAP locks and lists, kernel_pmap itself, and + * reserve various virtual spaces for temporary mappings. + */ +void +pmap_bootstrap(vm_offset_t firstaddr) +{ + pt2_entry_t *unused __unused; + struct sysmaps *sysmaps; + u_int i; + + /* + * Initialize the kernel pmap (which is statically allocated). + */ + PMAP_LOCK_INIT(kernel_pmap); + kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ + kernel_pmap->pm_pt1 = kern_pt1; + kernel_pmap->pm_pt2tab = kern_pt2tab; + CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ + TAILQ_INIT(&kernel_pmap->pm_pvchunk); + + /* + * Initialize the global pv list lock. + */ + rw_init(&pvh_global_lock, "pmap pv global"); + + LIST_INIT(&allpmaps); + + /* + * Request a spin mutex so that changes to allpmaps cannot be + * preempted by smp_rendezvous_cpus(). + */ + mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); + mtx_lock_spin(&allpmaps_lock); + LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + + /* + * Reserve some special page table entries/VA space for temporary + * mapping of pages. + */ +#define SYSMAP(c, p, v, n) do { \ + v = (c)pmap_preboot_reserve_pages(1); \ + p = pt2map_entry((vm_offset_t)v); \ + } while (0) + + /* + * Local CMAP1/CMAP2 are used for zeroing and copying pages. + * Local CMAP3 is used for data cache cleaning. + * Global CMAP3 is used for the idle process page zeroing. + */ + for (i = 0; i < MAXCPU; i++) { + sysmaps = &sysmaps_pcpu[i]; + mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); + SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1); + SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1); + SYSMAP(caddr_t, sysmaps->CMAP3, sysmaps->CADDR3, 1); + } + SYSMAP(caddr_t, CMAP3, CADDR3, 1); + + /* + * Crashdump maps. + */ + SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); + + /* + * _tmppt is used for reading arbitrary physical pages via /dev/mem. 
+ */ + SYSMAP(caddr_t, unused, _tmppt, 1); + + /* + * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), + * respectively. PADDR3 is used by pmap_pte2_ddb(). + */ + SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); + SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); +#ifdef DDB + SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); +#endif + mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); + + /* + * Note that in very short time in initarm(), we are going to + * initialize phys_avail[] array and no futher page allocation + * can happen after that until vm subsystem will be initialized. + */ + kernel_vm_end_new = kernel_vm_end; + virtual_end = vm_max_kernel_address; +} + +/* + * The function can already be use in second initialization stage. + * As such, the function DOES NOT call pmap_growkernel() where PT2 + * allocation can happen. So if used, be sure that PT2 for given + * virtual address is allocated already! + * + * Add a wired page to the kva. + * Note: not SMP coherent. + */ +static __inline void +pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, + uint32_t attr) +{ + pt1_entry_t *pte1p; + pt2_entry_t *pte2p; + + pte1p = kern_pte1(va); + if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ + /* + * This is a very low level function, so PT2 and particularly + * PT2PG associated with given virtual address must be already + * allocated. It's a pain mainly during pmap initialization + * stage. However, called after pmap initialization with + * virtual address not under kernel_vm_end will lead to + * the same misery. 
+ */ + if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) + panic("%s: kernel PT2 not allocated!", __func__); + } + + pte2p = pt2map_entry(va); + pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); +} + +static __inline void +pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int attr) +{ + + pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, attr); +} + +PMAP_INLINE void +pmap_kenter(vm_offset_t va, vm_paddr_t pa) +{ + + pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_NORMAL); +} + +/* + * Remove a page from the kernel pagetables. + * Note: not SMP coherent. + */ +PMAP_INLINE void +pmap_kremove(vm_offset_t va) +{ + pt2_entry_t *pte2p; + + pte2p = pt2map_entry(va); + pte2_clear(pte2p); +} + +/* + * Share new kernel PT2PG with all pmaps. + * The caller is responsible for maintaining TLB consistency. + */ +static void +pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) +{ + pmap_t pmap; + pt2_entry_t *pte2p; + + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(pmap, &allpmaps, pm_list) { + pte2p = pmap_pt2tab_entry(pmap, va); + pt2tab_store(pte2p, npte2); + } + mtx_unlock_spin(&allpmaps_lock); +} + +/* + * Share new kernel PTE1 with all pmaps. + * The caller is responsible for maintaining TLB consistency. + */ +static void +pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) +{ + pmap_t pmap; + pt1_entry_t *pte1p; + + mtx_lock_spin(&allpmaps_lock); + LIST_FOREACH(pmap, &allpmaps, pm_list) { + pte1p = pmap_pte1(pmap, va); + pte1_store(pte1p, npte1); + } + mtx_unlock_spin(&allpmaps_lock); +} + +/* + * Used to map a range of physical addresses into kernel + * virtual address space. + * + * The value passed in '*virt' is a suggested virtual address for + * the mapping. Architectures which can support a direct-mapped + * physical to virtual region can return the appropriate address + * within that region, leaving '*virt' unchanged. Other + * architectures should map the pages starting at '*virt' and + * update '*virt' with the first usable address after the mapped + * region. 
+ * + * NOTE: Read the comments above pmap_kenter_prot_attr() as + * the function is used herein! + */ +vm_offset_t +pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) +{ + vm_offset_t va, sva; + vm_paddr_t pte1_offset; + pt1_entry_t npte1; + u_int l1prot,l2prot; + + PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," + " prot = %d\n", __func__, *virt, start, end, end - start, prot)); + + l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE1_AP_KR; + l2prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; + l1prot = ATTR_TO_L1(l2prot); + + va = *virt; + /* + * Does the physical address range's size and alignment permit at + * least one section mapping to be created? + */ + pte1_offset = start & PTE1_OFFSET; + if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= + PTE1_SIZE) { + /* + * Increase the starting virtual address so that its alignment + * does not preclude the use of section mappings. + */ + if ((va & PTE1_OFFSET) < pte1_offset) + va = pte1_trunc(va) + pte1_offset; + else if ((va & PTE1_OFFSET) > pte1_offset) + va = pte1_roundup(va) + pte1_offset; + } + sva = va; + while (start < end) { + if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { + KASSERT((va & PTE1_OFFSET) == 0, + ("%s: misaligned va %#x", __func__, va)); + npte1 = PTE1_KERN(start, l1prot, PTE1_ATTR_NORMAL); + pmap_kenter_pte1(va, npte1); + va += PTE1_SIZE; + start += PTE1_SIZE; + } else { + pmap_kenter_prot_attr(va, start, l2prot, + PTE2_ATTR_NORMAL); + va += PAGE_SIZE; + start += PAGE_SIZE; + } + } + tlb_flush_range(sva, va - sva); + *virt = va; + return (sva); +} + +/* + * Make a temporary mapping for a physical address. + * This is only intended to be used for panic dumps. + */ +void * +pmap_kenter_temporary(vm_paddr_t pa, int i) +{ + vm_offset_t va; + + /* QQQ: 'i' should be less or equal to MAXDUMPPGS. 
*/ + + va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); + pmap_kenter(va, pa); + tlb_flush_local(va); + return ((void *)crashdumpmap); +} + + +/************************************* + * + * TLB & cache maintenance routines. + * + *************************************/ + +/* + * We inline these within pmap.c for speed. + */ +PMAP_INLINE void +pmap_tlb_flush(pmap_t pmap, vm_offset_t va) +{ + + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + tlb_flush(va); +} + +PMAP_INLINE void +pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) +{ + + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + tlb_flush_range(sva, size); +} + +PMAP_INLINE void +pmap_tlb_flush_ng(pmap_t pmap) +{ + + if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + tlb_flush_all_ng(); +} + +/* + * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. + * Requirements: + * - Must deal with pages in order to ensure that none of the PTE2_* bits + * are ever set, PTE2_V in particular. + * - Assumes we can write to pte2s without pte2_store() atomic ops. + * - Assumes nothing will ever test these addresses for 0 to indicate + * no mapping instead of correctly checking PTE2_V. + * - Assumes a vm_offset_t will fit in a pte2 (true for arm). + * Because PTE2_V is never set, there can be no mappings to invalidate. + */ +static vm_offset_t +pmap_pte2list_alloc(vm_offset_t *head) +{ + pt2_entry_t *pte2p; + vm_offset_t va; + + va = *head; + if (va == 0) + panic("pmap_ptelist_alloc: exhausted ptelist KVA"); + pte2p = pt2map_entry(va); + *head = *pte2p; + if (*head & PTE2_V) + panic("%s: va with PTE2_V set!", __func__); + *pte2p = 0; + return (va); +} + +static void +pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) +{ + pt2_entry_t *pte2p; + + if (va & PTE2_V) + panic("%s: freeing va with PTE2_V set!", __func__); + pte2p = pt2map_entry(va); + *pte2p = *head; /* virtual! 
PTE2_V is 0 though */ + *head = va; +} + +static void +pmap_pte2list_init(vm_offset_t *head, void *base, int npages) +{ + int i; + vm_offset_t va; + + *head = 0; + for (i = npages - 1; i >= 0; i--) { + va = (vm_offset_t)base + i * PAGE_SIZE; + pmap_pte2list_free(head, va); + } +} + +/***************************************************************************** + * + * PMAP third and final stage initialization. + * + * After pmap_init() is called, PMAP subsystem is fully initialized. + * + *****************************************************************************/ + +SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); + +SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, + "Max number of PV entries"); +SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, + "Page share factor per proc"); + +static u_long nkpt2pg = NKPT2PG; +SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, + &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); + +static int sp_enabled = 1; +SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &sp_enabled, 0, "Are large page mappings enabled?"); + +static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD, 0, + "1MB page mapping counters"); + +static u_long pmap_pte1_demotions; +SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, + &pmap_pte1_demotions, 0, "1MB page demotions"); + +static u_long pmap_pte1_mappings; +SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, + &pmap_pte1_mappings, 0, "1MB page mappings"); + +static u_long pmap_pte1_p_failures; +SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, + &pmap_pte1_p_failures, 0, "1MB page promotion failures"); + +static u_long pmap_pte1_promotions; +SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, + &pmap_pte1_promotions, 0, "1MB page promotions"); + +static __inline ttb_entry_t +pmap_ttb_get(pmap_t pmap) +{ + + return (vtophys(pmap->pm_pt1) | ttb_flags); +} + +/* + * 
Initialize a vm_page's machine-dependent fields. + * + * Variations: + * 1. Pages for L2 page tables are always not managed. So, pv_list and + * pt2_wirecount can share same physical space. However, proper + * initialization on a page alloc for page tables and reinitialization + * on the page free must be ensured. + */ +void +pmap_page_init(vm_page_t m) +{ + + TAILQ_INIT(&m->md.pv_list); + pt2_wirecount_init(m); + m->md.pat_mode = PTE2_ATTR_NORMAL; +} + +/* + * Virtualization for faster way how to zero whole page. + */ +static __inline void +pagezero(void *page) +{ + + bzero(page, PAGE_SIZE); +} + +/* + * Zero L2 page table page. + * Use same KVA as in pmap_zero_page(). + */ +static __inline vm_paddr_t +pmap_pt2pg_zero(vm_page_t m) +{ + vm_paddr_t pa; + struct sysmaps *sysmaps; + + pa = VM_PAGE_TO_PHYS(m); + + /* + * XXX: For now, we map whole page even if it's already zero, + * to sync it even if the sync is only DSB. + */ + sched_pin(); + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (pte2_load(sysmaps->CMAP2) != 0) + panic("%s: CMAP2 busy", __func__); + pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(pa, PTE2_AP_KRW, + m->md.pat_mode)); + tlb_flush_local((vm_offset_t)sysmaps->CADDR2); + /* Even VM_ALLOC_ZERO request is only advisory. */ + if ((m->flags & PG_ZERO) == 0) + pagezero(sysmaps->CADDR2); + pte2_sync_range((pt2_entry_t *)sysmaps->CADDR2, PAGE_SIZE); + pte2_clear(sysmaps->CMAP2); + sched_unpin(); + mtx_unlock(&sysmaps->lock); + + return (pa); +} + +/* + * Init just allocated page as L2 page table(s) holder + * and return its physical address. + */ +static __inline vm_paddr_t +pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + vm_paddr_t pa; + pt2_entry_t *pte2p; + + /* Check page attributes. */ + if (pmap_page_get_memattr(m) != pt_memattr) + pmap_page_set_memattr(m, pt_memattr); + + /* Zero page and init wire counts. 
*/ + pa = pmap_pt2pg_zero(m); + pt2_wirecount_init(m); + + /* + * Map page to PT2MAP address space for given pmap. + * Note that PT2MAP space is shared with all pmaps. + */ + if (pmap == kernel_pmap) + pmap_kenter_pt2tab(va, PTE2_KPT(pa)); + else { + pte2p = pmap_pt2tab_entry(pmap, va); + pt2tab_store(pte2p, PTE2_KPT_NG(pa)); + } + + return (pa); +} + +/* + * Initialize the pmap module. + * Called by vm_init, to initialize any structures that the pmap + * system needs to map virtual memory. + */ +void +pmap_init(void) +{ + vm_size_t s; + pt2_entry_t *pte2p, pte2; + u_int i, pte1_idx, pv_npg; + + PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); + + /* + * Initialize the vm page array entries for kernel pmap's + * L2 page table pages allocated in advance. + */ + pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); + pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); + for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { + vm_paddr_t pa; + vm_page_t m; + + pte2 = pte2_load(pte2p); + KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); + + pa = pte2_pa(pte2); + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m >= vm_page_array && + m < &vm_page_array[vm_page_array_size], + ("%s: L2 page table page is out of range", __func__)); + + m->pindex = pte1_idx; + m->phys_addr = pa; + pte1_idx += NPT2_IN_PG; + } + + /* + * Initialize the address space (zone) for the pv entries. Set a + * high water mark so that the system can recover from excessive + * numbers of pv entries. + */ + TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); + pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; + TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); + pv_entry_max = roundup(pv_entry_max, _NPCPV); + pv_entry_high_water = 9 * (pv_entry_max / 10); + + /* + * Are large page mappings enabled? 
+ */ + TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); + if (sp_enabled) { + KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, + ("%s: can't assign to pagesizes[1]", __func__)); + pagesizes[1] = PTE1_SIZE; + } + + /* + * Calculate the size of the pv head table for sections. + * Handle the possibility that "vm_phys_segs[...].end" is zero. + * Note that the table is only for sections which could be promoted. + */ + first_managed_pa = pte1_trunc(vm_phys_segs[0].start); + pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) + - first_managed_pa) / PTE1_SIZE + 1; + + /* + * Allocate memory for the pv head table for sections. + */ + s = (vm_size_t)(pv_npg * sizeof(struct md_page)); + s = round_page(s); + pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, + M_WAITOK | M_ZERO); + for (i = 0; i < pv_npg; i++) + TAILQ_INIT(&pv_table[i].pv_list); + + pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); + pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); + if (pv_chunkbase == NULL) + panic("%s: not enough kvm for pv chunks", __func__); + pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); +} + +/* + * Add a list of wired pages to the kva + * this routine is only used for temporary + * kernel mappings that do not need to have + * page modification or references recorded. + * Note that old mappings are simply written + * over. The page *must* be wired. + * Note: SMP coherent. Uses a ranged shootdown IPI. 
+ */ +void +pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) +{ + u_int anychanged; + pt2_entry_t *epte2p, *pte2p, pte2; + vm_page_t m; + vm_paddr_t pa; + + anychanged = 0; + pte2p = pt2map_entry(sva); + epte2p = pte2p + count; + while (pte2p < epte2p) { + m = *ma++; + pa = VM_PAGE_TO_PHYS(m); + pte2 = pte2_load(pte2p); + if ((pte2_pa(pte2) != pa) || + (pte2_attr(pte2) != m->md.pat_mode)) { + anychanged++; + pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, + m->md.pat_mode)); + } + pte2p++; + } + if (__predict_false(anychanged)) + tlb_flush_range(sva, count * PAGE_SIZE); +} + +/* + * This routine tears out page mappings from the + * kernel -- it is meant only for temporary mappings. + * Note: SMP coherent. Uses a ranged shootdown IPI. + */ +void +pmap_qremove(vm_offset_t sva, int count) +{ + vm_offset_t va; + + va = sva; + while (count-- > 0) { + pmap_kremove(va); + va += PAGE_SIZE; + } + tlb_flush_range(sva, va - sva); +} + +/* + * Are we current address space or kernel? + */ +static __inline int +pmap_is_current(pmap_t pmap) +{ + + return (pmap == kernel_pmap || + (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); +} + +/* + * If the given pmap is not the current or kernel pmap, the returned + * pte2 must be released by passing it to pmap_pte2_release(). + */ +static pt2_entry_t * +pmap_pte2(pmap_t pmap, vm_offset_t va) +{ + pt1_entry_t pte1; + vm_paddr_t pt2pg_pa; + + pte1 = pte1_load(pmap_pte1(pmap, va)); + if (pte1_is_section(pte1)) + panic("%s: attempt to map PTE1", __func__); + if (pte1_is_link(pte1)) { + /* Are we current address space or kernel? */ + if (pmap_is_current(pmap)) + return (pt2map_entry(va)); + /* Note that L2 page table size is not equal to PAGE_SIZE. 
*/ + pt2pg_pa = trunc_page(pte1_link_pa(pte1)); + mtx_lock(&PMAP2mutex); + if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { + pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); + tlb_flush((vm_offset_t)PADDR2); + } + return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); + } + return (NULL); +} + +/* + * Releases a pte2 that was obtained from pmap_pte2(). + * Be prepared for the pte2p being NULL. + */ +static __inline void +pmap_pte2_release(pt2_entry_t *pte2p) +{ + + if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { + mtx_unlock(&PMAP2mutex); + } +} + +/* + * Super fast pmap_pte2 routine best used when scanning + * the pv lists. This eliminates many coarse-grained + * invltlb calls. Note that many of the pv list + * scans are across different pmaps. It is very wasteful + * to do an entire tlb flush for checking a single mapping. + * + * If the given pmap is not the current pmap, pvh_global_lock + * must be held and curthread pinned to a CPU. + */ +static pt2_entry_t * +pmap_pte2_quick(pmap_t pmap, vm_offset_t va) +{ + pt1_entry_t pte1; + vm_paddr_t pt2pg_pa; + + pte1 = pte1_load(pmap_pte1(pmap, va)); + if (pte1_is_section(pte1)) + panic("%s: attempt to map PTE1", __func__); + if (pte1_is_link(pte1)) { + /* Are we current address space or kernel? */ + if (pmap_is_current(pmap)) + return (pt2map_entry(va)); + rw_assert(&pvh_global_lock, RA_WLOCKED); + KASSERT(curthread->td_pinned > 0, + ("%s: curthread not pinned", __func__)); + /* Note that L2 page table size is not equal to PAGE_SIZE. 
*/ + pt2pg_pa = trunc_page(pte1_link_pa(pte1)); + if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { + pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); +#ifdef SMP + PMAP1cpu = PCPU_GET(cpuid); +#endif + tlb_flush_local((vm_offset_t)PADDR1); + PMAP1changed++; + } else +#ifdef SMP + if (PMAP1cpu != PCPU_GET(cpuid)) { + PMAP1cpu = PCPU_GET(cpuid); + tlb_flush_local((vm_offset_t)PADDR1); + PMAP1changedcpu++; + } else +#endif + PMAP1unchanged++; + return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); + } + return (NULL); +} + +/* + * Routine: pmap_extract + * Function: + * Extract the physical page address associated + * with the given map/virtual_address pair. + */ +vm_paddr_t +pmap_extract(pmap_t pmap, vm_offset_t va) +{ + vm_paddr_t pa; + pt1_entry_t pte1; + pt2_entry_t *pte2p; + + PMAP_LOCK(pmap); + pte1 = pte1_load(pmap_pte1(pmap, va)); + if (pte1_is_section(pte1)) + pa = pte1_pa(pte1) | (va & PTE1_OFFSET); + else if (pte1_is_link(pte1)) { + pte2p = pmap_pte2(pmap, va); + pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); + pmap_pte2_release(pte2p); + } else + pa = 0; + PMAP_UNLOCK(pmap); + return (pa); +} + +/* + * Routine: pmap_extract_and_hold + * Function: + * Atomically extract and hold the physical page + * with the given pmap and virtual address pair + * if that mapping permits the given protection. 
+ */ +vm_page_t +pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) +{ + vm_paddr_t pa, lockpa; + pt1_entry_t pte1; + pt2_entry_t pte2, *pte2p; + vm_page_t m; + + lockpa = 0; + m = NULL; + PMAP_LOCK(pmap); +retry: + pte1 = pte1_load(pmap_pte1(pmap, va)); + if (pte1_is_section(pte1)) { + if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { + pa = pte1_pa(pte1) | (va & PTE1_OFFSET); + if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) + goto retry; + m = PHYS_TO_VM_PAGE(pa); + vm_page_hold(m); + } + } else if (pte1_is_link(pte1)) { + pte2p = pmap_pte2(pmap, va); + pte2 = pte2_load(pte2p); + pmap_pte2_release(pte2p); + if (pte2_is_valid(pte2) && + (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { + pa = pte2_pa(pte2); + if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) + goto retry; + m = PHYS_TO_VM_PAGE(pa); + vm_page_hold(m); + } + } + PA_UNLOCK_COND(lockpa); + PMAP_UNLOCK(pmap); + return (m); +} + +/* + * Grow the number of kernel L2 page table entries, if needed. + */ +void +pmap_growkernel(vm_offset_t addr) +{ + vm_page_t m; + vm_paddr_t pt2pg_pa, pt2_pa; + pt1_entry_t pte1; + pt2_entry_t pte2; + + PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); + /* + * All the time kernel_vm_end is first KVA for which underlying + * L2 page table is either not allocated or linked from L1 page table + * (not considering sections). Except for two possible cases: + * + * (1) in the very beginning as long as pmap_growkernel() was + * not called, it could be first unused KVA (which is not + * rounded up to PTE1_SIZE), + * + * (2) when all KVA space is mapped and kernel_map->max_offset + * address is not rounded up to PTE1_SIZE. (For example, + * it could be 0xFFFFFFFF.) 
+ */ + kernel_vm_end = pte1_roundup(kernel_vm_end); + mtx_assert(&kernel_map->system_mtx, MA_OWNED); + addr = roundup2(addr, PTE1_SIZE); + if (addr - 1 >= kernel_map->max_offset) + addr = kernel_map->max_offset; + while (kernel_vm_end < addr) { + pte1 = pte1_load(kern_pte1(kernel_vm_end)); + if (pte1_is_valid(pte1)) { + kernel_vm_end += PTE1_SIZE; + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + continue; + } + + /* + * kernel_vm_end_new is used in pmap_pinit() when kernel + * mappings are entered to new pmap all at once to avoid race + * between pmap_kenter_pte1() and kernel_vm_end increase. + * The same aplies to pmap_kenter_pt2tab(). + */ + kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; + + pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); + if (!pte2_is_valid(pte2)) { + /* + * Install new PT2s page into kernel PT2TAB. + */ + m = vm_page_alloc(NULL, + pte1_index(kernel_vm_end) & ~PT2PG_MASK, + VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) + panic("%s: no memory to grow kernel", __func__); + /* + * QQQ: To link all new L2 page tables from L1 page + * table now and so pmap_kenter_pte1() them + * at once together with pmap_kenter_pt2tab() + * could be nice speed up. However, + * pmap_growkernel() does not happen so often... + * QQQ: The other TTBR is another option. 
+ */ + pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, + m); + } else + pt2pg_pa = pte2_pa(pte2); + + pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); + pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); + + kernel_vm_end = kernel_vm_end_new; + if (kernel_vm_end - 1 >= kernel_map->max_offset) { + kernel_vm_end = kernel_map->max_offset; + break; + } + } +} + +static int +kvm_size(SYSCTL_HANDLER_ARGS) +{ + unsigned long ksize = vm_max_kernel_address - KERNBASE; + + return (sysctl_handle_long(oidp, &ksize, 0, req)); +} +SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, + 0, 0, kvm_size, "IU", "Size of KVM"); + +static int +kvm_free(SYSCTL_HANDLER_ARGS) +{ + unsigned long kfree = vm_max_kernel_address - kernel_vm_end; + + return (sysctl_handle_long(oidp, &kfree, 0, req)); +} +SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, + 0, 0, kvm_free, "IU", "Amount of KVM free"); + +/*********************************************** + * + * Pmap allocation/deallocation routines. + * + ***********************************************/ + +/* + * Initialize the pmap for the swapper process. + */ +void +pmap_pinit0(pmap_t pmap) +{ + PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); + + PMAP_LOCK_INIT(pmap); + + /* + * Kernel page table directory and pmap stuff around is already + * initialized, we are using it right now and here. So, finish + * only PMAP structures initialization for process0 ... + * + * Since the L1 page table and PT2TAB is shared with the kernel pmap, + * which is already included in the list "allpmaps", this pmap does + * not need to be inserted into that list. 
+ */ + pmap->pm_pt1 = kern_pt1; + pmap->pm_pt2tab = kern_pt2tab; + CPU_ZERO(&pmap->pm_active); + PCPU_SET(curpmap, pmap); + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + CPU_SET(0, &pmap->pm_active); +} + +static __inline void +pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, + vm_offset_t eva) +{ + u_int idx, count; + + idx = pte1_index(sva); + count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); + bcopy(spte1p + idx, dpte1p + idx, count); +} + +static __inline void +pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, + vm_offset_t eva) +{ + u_int idx, count; + + idx = pt2tab_index(sva); + count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); + bcopy(spte2p + idx, dpte2p + idx, count); +} + +/* + * Initialize a preallocated and zeroed pmap structure, + * such as one in a vmspace structure. + */ +int +pmap_pinit(pmap_t pmap) +{ + pt1_entry_t *pte1p; + pt2_entry_t *pte2p; + vm_paddr_t pa, pt2tab_pa; + u_int i; + + PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, + pmap->pm_pt1)); + + /* + * No need to allocate L2 page table space yet but we do need + * a valid L1 page table and PT2TAB table. + * + * Install shared kernel mappings to these tables. It's a little + * tricky as some parts of KVA are reserved for vectors, devices, + * and whatever else. These parts are supposed to be above + * vm_max_kernel_address. Thus two regions should be installed: + * + * (1) . + * + * QQQ: The second region should be stable enough to be installed + * only once in time when the tables are allocated. + * QQQ: Maybe copy of both regions at once could be faster ... + * QQQ: Maybe the other TTBR is an option. + * + * Finally, install own PT2TAB table to these tables. 
+ */ + + if (pmap->pm_pt1 == NULL) { + pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(kernel_arena, + NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, + pt_memattr); + if (pmap->pm_pt1 == NULL) + return (0); + } + if (pmap->pm_pt2tab == NULL) { + /* + * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page + * only, what should be the only size for 32 bit systems, + * then we could allocate it with vm_page_alloc() and all + * the stuff needed as other L2 page table pages. + * (2) Note that a process PT2TAB is special L2 page table + * page. Its mapping in kernel_arena is permanent and can + * be used no matter which process is current. Its mapping + * in PT2MAP can be used only for current process. + */ + pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(kernel_arena, + NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); + if (pmap->pm_pt2tab == NULL) { + /* + * QQQ: As struct pmap is allocated from UMA with + * UMA_ZONE_NOFREE flag, it's important to leave + * no allocation in pmap if initialization failed. + */ + kmem_free(kernel_arena, (vm_offset_t)pmap->pm_pt1, + NB_IN_PT1); + pmap->pm_pt1 = NULL; + return (0); + } + /* + * QQQ: Each L2 page table page vm_page_t has pindex set to + * pte1 index of virtual address mapped by this page. + * It's not valid for non kernel PT2TABs themselves. + * The pindex of these pages can not be altered because + * of the way how they are allocated now. However, it + * should not be a problem. + */ + } + + mtx_lock_spin(&allpmaps_lock); + /* + * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), + * kernel_vm_end_new is used here instead of kernel_vm_end. 
+ */ + pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, + kernel_vm_end_new - 1); + pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, + 0xFFFFFFFF); + pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, + kernel_vm_end_new - 1); + pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, + 0xFFFFFFFF); + LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); + mtx_unlock_spin(&allpmaps_lock); + + /* + * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. + * I.e. self reference mapping. The PT2TAB is private, however mapped + * into shared PT2MAP space, so the mapping should be not global. + */ + pt2tab_pa = vtophys(pmap->pm_pt2tab); + pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); + for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { + pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); + } + + /* Insert PT2MAP PT2s into pmap PT1. */ + pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); + for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { + pte1_store(pte1p++, PTE1_LINK(pa)); + } + + /* + * Now synchronize new mapping which was made above. + */ + pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); + pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); + + CPU_ZERO(&pmap->pm_active); + TAILQ_INIT(&pmap->pm_pvchunk); + bzero(&pmap->pm_stats, sizeof pmap->pm_stats); + + return (1); +} + +#ifdef SMP +/* + * Deal with a SMP shootdown of other users of the pmap that we are + * trying to dispose of. This can be a bit hairy. 
+ */
+static cpuset_t *lazymask;	/* CPU set in which each fixed CPU clears its bit */
+static ttb_entry_t lazyttb;	/* pmap_ttb_get() value of the pmap being fixed */
+static volatile u_int lazywait;	/* 0 while an IPI is pending, 1 once handled */
+
+void
+pmap_lazyfix_action(void)
+{
+
+#ifdef COUNT_IPIS
+	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
+#endif
+	spinlock_enter();
+	/* Switch away only if this CPU still has the target TTB loaded. */
+	if (cp15_ttbr_get() == lazyttb) {
+		cp15_ttbr_set(curthread->td_pcb->pcb_pagedir);
+	}
+	CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask);
+	atomic_store_rel_int(&lazywait, 1); /* releases the spinning sender */
+	spinlock_exit();
+
+}
+
+static void
+pmap_lazyfix_self(u_int cpuid)
+{
+
+	/* Same fix-up as pmap_lazyfix_action(), minus the lazywait store. */
+	spinlock_enter();
+	if (cp15_ttbr_get() == lazyttb) {
+		cp15_ttbr_set(curthread->td_pcb->pcb_pagedir);
+	}
+	CPU_CLR_ATOMIC(cpuid, lazymask);
+	spinlock_exit();
+}
+
+static void
+pmap_lazyfix(pmap_t pmap)
+{
+	cpuset_t mymask, mask;
+	u_int cpuid, spins;
+	int lsb;
+
+	/*
+	 * Handle one CPU of pm_active per iteration; the set is re-read
+	 * at the end of each pass and the loop ends once it is empty.
+	 */
+	mask = pmap->pm_active;
+	while (!CPU_EMPTY(&mask)) {
+		spins = 50000000;
+
+		/* Find least significant set bit. */
+		lsb = CPU_FFS(&mask);
+		MPASS(lsb != 0);
+		lsb--;
+		CPU_SETOF(lsb, &mask);
+		mtx_lock_spin(&smp_ipi_mtx);
+
+		lazyttb = pmap_ttb_get(pmap);
+		cpuid = PCPU_GET(cpuid);
+
+		/* Use a cpuset just for having an easy check. */
+		CPU_SETOF(cpuid, &mymask);
+		if (!CPU_CMP(&mask, &mymask)) {
+			/* The selected CPU is this one; no IPI is needed. */
+			lazymask = &pmap->pm_active;
+			pmap_lazyfix_self(cpuid);
+		} else {
+			atomic_store_rel_int((u_int *)&lazymask,
+			    (u_int)&pmap->pm_active);
+			atomic_store_rel_int(&lazywait, 0);
+			ipi_selected(mask, IPI_LAZYPMAP);
+			/* Bounded spin until the handler sets lazywait. */
+			while (lazywait == 0) {
+				if (--spins == 0)
+					break;
+			}
+		}
+		mtx_unlock_spin(&smp_ipi_mtx);
+		if (spins == 0)
+			printf("%s: spun for 50000000\n", __func__);
+		mask = pmap->pm_active;
+	}
+}
+#else	/* SMP */
+/*
+ * Cleaning up on uniprocessor is easy.  For various reasons, we're
+ * unlikely to have to even execute this code, including the fact
+ * that the cleanup is deferred until the parent does a wait(2), which
+ * means that another userland process has run.
+ */
+static void
+pmap_lazyfix(pmap_t pmap)
+{
+
+	if (!CPU_EMPTY(&pmap->pm_active)) {
+		/* Reload the current thread's page directory. */
+		cp15_ttbr_set(curthread->td_pcb->pcb_pagedir);
+		CPU_ZERO(&pmap->pm_active);
+	}
+}
+#endif	/* SMP */
+
+#ifdef INVARIANTS
+static boolean_t /* TRUE when every entry below VM_MAXUSER_ADDRESS is 0 */
+pt2tab_user_is_empty(pt2_entry_t *tab)
+{
+	u_int i, end;
+
+	end = pt2tab_index(VM_MAXUSER_ADDRESS);
+	for (i = 0; i < end; i++)
+		if (tab[i] != 0) return (FALSE);
+	return (TRUE);
+}
+#endif
+/*
+ * Release any resources held by the given physical map.
+ * Called when a pmap initialized by pmap_pinit is being released.
+ * Should only be called if the map contains no valid mappings.
+ *
+ * The pmap is removed from the "allpmaps" list.  With INVARIANTS the
+ * kernel part (KERNBASE and above) of its L1 page table and PT2TAB is
+ * zeroed as well.  The tables themselves are not freed here.
+ */
+void
+pmap_release(pmap_t pmap)
+{
+#ifdef INVARIANTS
+	vm_offset_t start, end;
+#endif
+	KASSERT(pmap->pm_stats.resident_count == 0,
+	    ("%s: pmap resident count %ld != 0", __func__,
+	    pmap->pm_stats.resident_count));
+	KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab),
+	    ("%s: has allocated user PT2(s)", __func__));
+
+	pmap_lazyfix(pmap);
+	mtx_lock_spin(&allpmaps_lock);
+	LIST_REMOVE(pmap, pm_list);
+	mtx_unlock_spin(&allpmaps_lock);
+
+#ifdef INVARIANTS
+	/* "start" and "end" are byte offsets into the respective table. */
+	start = pte1_index(KERNBASE) * sizeof(pt1_entry_t);
+	end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t);
+	bzero((char *)pmap->pm_pt1 + start, end - start);
+
+	start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t);
+	end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t);
+	bzero((char *)pmap->pm_pt2tab + start, end - start);
+#endif
+	/*
+	 * We are leaving PT1 and PT2TAB allocated on released pmap,
+	 * so hopefully UMA vmspace_zone will always be inited with
+	 * UMA_ZONE_NOFREE flag.
+	 */
+}
+
+/*********************************************************
+ *
+ *  L2 table pages and their pages management routines.
+ *
+ *********************************************************/
+
+/*
+ *  Virtual interface for L2 page table wire counting.
+ *
+ *  Each L2 page table in a page has own counter which counts a number of
+ *  valid mappings in a table.
Global page counter counts mappings in all
+ *  tables in a page plus a single itself mapping in PT2TAB.
+ *
+ *  During a promotion we leave the associated L2 page table counter
+ *  untouched, so the table (strictly speaking a page which holds it)
+ *  is never freed if promoted.
+ *
+ *  If a page m->wire_count == 1 then no valid mappings exist in any L2 page
+ *  table in the page and the page itself is only mapped in PT2TAB.
+ *
+ *  All helpers below select the per-table counter with
+ *  (pte1 index & PT2PG_MASK).
+ */
+
+static __inline void
+pt2_wirecount_init(vm_page_t m)
+{
+	u_int i;
+
+	/*
+	 * Note: A page m is allocated with VM_ALLOC_WIRED flag and
+	 *       m->wire_count should be already set correctly.
+	 *       So, there is no need to set it again herein.
+	 */
+	for (i = 0; i < NPT2_IN_PG; i++)
+		m->md.pt2_wirecount[i] = 0;
+}
+
+static __inline void
+pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx)
+{
+
+	/*
+	 * Note: A just modified pte2 (i.e. already allocated)
+	 *       is acquiring one extra reference which must be
+	 *       explicitly cleared. It influences the KASSERTs herein.
+	 *       All L2 page tables in a page always belong to the same
+	 *       pmap, so we allow only one extra reference for the page.
+	 */
+	KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1),
+	    ("%s: PT2 is overflowing ...", __func__));
+	KASSERT(m->wire_count <= (NPTE2_IN_PG + 1),
+	    ("%s: PT2PG is overflowing ...", __func__));
+
+	/* The page counter and the table counter move together. */
+	m->wire_count++;
+	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++;
+}
+
+static __inline void
+pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx)
+{
+
+	KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0,
+	    ("%s: PT2 is underflowing ...", __func__));
+	/* wire_count must stay at 1 or more; see pt2pg_is_empty(). */
+	KASSERT(m->wire_count > 1,
+	    ("%s: PT2PG is underflowing ...", __func__));
+
+	m->wire_count--;
+	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--;
+}
+
+static __inline void
+pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count)
+{
+
+	KASSERT(count <= NPTE2_IN_PT2,
+	    ("%s: invalid count %u", __func__, count));
+	KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK],
+	    ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count,
+	    m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]));
+
+	/* Replace this table's share of the page counter with "count". */
+	m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK];
+	m->wire_count += count;
+	m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count;
+
+	KASSERT(m->wire_count <= (NPTE2_IN_PG + 1),
+	    ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count));
+}
+
+static __inline uint32_t
+pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx)
+{
+
+	return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]);
+}
+
+static __inline boolean_t
+pt2_is_empty(vm_page_t m, vm_offset_t va)
+{
+
+	/* TRUE when the L2 table covering va has a zero counter. */
+	return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0);
+}
+
+static __inline boolean_t
+pt2_is_full(vm_page_t m, vm_offset_t va)
+{
+
+	/* TRUE when the counter equals NPTE2_IN_PT2. */
+	return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] ==
+	    NPTE2_IN_PT2);
+}
+
+static __inline boolean_t
+pt2pg_is_empty(vm_page_t m)
+{
+
+	/* Only the page's own PT2TAB mapping is counted. */
+	return (m->wire_count == 1);
+}
+
+/*
+ *  This routine is called if the L2 page table
+ *  is not mapped correctly.
+ */
+static vm_page_t
+_pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags)
+{
+	uint32_t pte1_idx;
+	pt1_entry_t *pte1p;
+	pt2_entry_t pte2;
+	vm_page_t m;
+	vm_paddr_t pt2pg_pa, pt2_pa;
+
+	pte1_idx = pte1_index(va);
+	pte1p = pmap->pm_pt1 + pte1_idx;
+
+	KASSERT(pte1_load(pte1p) == 0,
+	    ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx,
+	    pte1_load(pte1p)));
+
+	pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va));
+	if (!pte2_is_valid(pte2)) {
+		/*
+		 * Install new PT2s page into pmap PT2TAB.
+		 */
+		m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK,
+		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
+		if (m == NULL) {
+			if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
+				/*
+				 * Both locks are dropped around VM_WAIT and
+				 * retaken in the reverse order afterwards.
+				 */
+				PMAP_UNLOCK(pmap);
+				rw_wunlock(&pvh_global_lock);
+				VM_WAIT;
+				rw_wlock(&pvh_global_lock);
+				PMAP_LOCK(pmap);
+			}
+
+			/*
+			 * Indicate the need to retry.  While waiting,
+			 * the L2 page table page may have been allocated.
+			 */
+			return (NULL);
+		}
+		pmap->pm_stats.resident_count++;
+		pt2pg_pa = pmap_pt2pg_init(pmap, va, m);
+	} else {
+		/* The page of L2 tables exists; only the L1 link is missing. */
+		pt2pg_pa = pte2_pa(pte2);
+		m = PHYS_TO_VM_PAGE(pt2pg_pa);
+	}
+
+	pt2_wirecount_inc(m, pte1_idx);
+	pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx);
+	pte1_store(pte1p, PTE1_LINK(pt2_pa));
+
+	return (m);
+}
+
+static vm_page_t
+pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags)
+{
+	u_int pte1_idx;
+	pt1_entry_t *pte1p, pte1;
+	vm_page_t m;
+
+	pte1_idx = pte1_index(va);
+retry:
+	pte1p = pmap->pm_pt1 + pte1_idx;
+	pte1 = pte1_load(pte1p);
+
+	/*
+	 * This supports switching from a 1MB page to a
+	 * normal 4K page.
+	 */
+	if (pte1_is_section(pte1)) {
+		(void)pmap_demote_pte1(pmap, pte1p, va);
+		/*
+		 * Reload pte1 after demotion.
+		 *
+		 * Note: Demotion can even fail as either PT2 is not found for
+		 *       the virtual address or PT2PG can not be allocated.
+		 */
+		pte1 = pte1_load(pte1p);
+	}
+
+	/*
+	 * If the L2 page table page is mapped, we just increment the
+	 * hold count, and activate it.
+	 */
+	if (pte1_is_link(pte1)) {
+		m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
+		pt2_wirecount_inc(m, pte1_idx);
+	} else {
+		/*
+		 * Here if the PT2 isn't mapped, or if it has
+		 * been deallocated.
+		 */
+		m = _pmap_allocpte2(pmap, va, flags);
+		/* With PMAP_ENTER_NOSLEEP a NULL result is returned as is. */
+		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
+			goto retry;
+	}
+
+	return (m);
+}
+
+static __inline void
+pmap_free_zero_pages(struct spglist *free)
+{
+	vm_page_t m;
+
+	/* Pop every page off the list and free it. */
+	while ((m = SLIST_FIRST(free)) != NULL) {
+		SLIST_REMOVE_HEAD(free, plinks.s.ss);
+		/* Preserve the page's PG_ZERO setting. */
+		vm_page_free_toq(m);
+	}
+}
+
+/*
+ * Schedule the specified unused L2 page table page to be freed. Specifically,
+ * add the page to the specified list of pages that will be released to the
+ * physical memory manager after the TLB has been updated.
+ * The page is flagged PG_ZERO before it is put on the list.
+ */
+static __inline void
+pmap_add_delayed_free_list(vm_page_t m, struct spglist *free)
+{
+
+	/*
+	 * Put page on a list so that it is released after
+	 * *ALL* TLB shootdown is done
+	 */
+#ifdef PMAP_DEBUG
+	pmap_zero_page_check(m);
+#endif
+	m->flags |= PG_ZERO;
+	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
+}
+
+/*
+ * Unwire L2 page tables page.
+ *
+ * The page must already be empty (see pt2pg_is_empty()).  Every L1 link
+ * to an L2 table in the page is cleared, the page is unmapped from
+ * PT2TAB, and its wire count is set to 0.
+ */
+static void
+pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m)
+{
+	pt1_entry_t *pte1p, opte1 __unused;
+	pt2_entry_t *pte2p;
+	uint32_t i;
+
+	KASSERT(pt2pg_is_empty(m),
+	    ("%s: pmap %p PT2PG %p wired", __func__, pmap, m));
+
+	/*
+	 * Unmap all L2 page tables in the page from L1 page table.
+	 *
+	 * QQQ: Individual L2 page tables (except the last one) can be unmapped
+	 * earlier. However, we are doing that this way.
+	 */
+	KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK),
+	    ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m));
+	pte1p = pmap->pm_pt1 + m->pindex;
+	for (i = 0; i < NPT2_IN_PG; i++, pte1p++) {
+		KASSERT(m->md.pt2_wirecount[i] == 0,
+		    ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m));
+		opte1 = pte1_load(pte1p);
+		/* Only link entries are cleared; others are left alone. */
+		if (pte1_is_link(opte1))
+			pte1_clear(pte1p);
+#ifdef INVARIANTS
+		else
+			KASSERT((opte1 == 0) || pte1_is_section(opte1),
+			    ("%s: pmap %p va %#x bad pte1 %x at %u", __func__,
+			    pmap, va, opte1, i));
+#endif
+	}
+
+	/*
+	 * Unmap the page from PT2TAB.
+	 */
+	pte2p = pmap_pt2tab_entry(pmap, va);
+	(void)pt2tab_load_clear(pte2p);
+	pmap_tlb_flush(pmap, pt2map_pt2pg(va));
+
+	m->wire_count = 0;
+	pmap->pm_stats.resident_count--;
+
+	/*
+	 * This is a release store so that the ordinary store unmapping
+	 * the L2 page table page is globally performed before TLB shoot-
+	 * down is begun.
+	 */
+	atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1);
+}
+
+/*
+ * Decrements a L2 page table page's wire count, which is used to record the
+ * number of valid page table entries within the page.  If the wire count
+ * drops to zero, then the page table page is unmapped.  Returns TRUE if the
+ * page table page was unmapped and FALSE otherwise.
+ */
+static __inline boolean_t
+pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
+{
+	pt2_wirecount_dec(m, pte1_index(va));
+	if (pt2pg_is_empty(m)) {
+		/*
+		 * QQQ: Wire count is zero, so whole page should be zero and
+		 *      we can set PG_ZERO flag to it.
+		 *      Note that when promotion is enabled, it takes some
+		 *      more efforts. See pmap_unwire_pt2_all() below.
+		 */
+		pmap_unwire_pt2pg(pmap, va, m);
+		pmap_add_delayed_free_list(m, free);
+		return (TRUE);
+	} else
+		return (FALSE);
+}
+
+/*
+ * Drop a L2 page table page's wire count at once, which is used to record
+ * the number of valid L2 page table entries within the page.
If the wire
+ * count drops to zero, then the L2 page table page is unmapped.
+ *
+ * A full L2 page table has its counter reset to 0 and its part of the
+ * page zeroed; any other table is expected to be empty already.  If the
+ * whole page ends up empty, it is unwired and queued on "free".
+ */
+static __inline void
+pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m,
+    struct spglist *free)
+{
+	u_int pte1_idx = pte1_index(va);
+
+	KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK),
+	    ("%s: PT2 page's pindex is wrong", __func__));
+	KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx),
+	    ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count,
+	    pt2_wirecount_get(m, pte1_idx)));
+
+	/*
+	 * It's possible that the L2 page table was never used.
+	 * It happened in case that a section was created without promotion.
+	 */
+	if (pt2_is_full(m, va)) {
+		pt2_wirecount_set(m, pte1_idx, 0);
+
+		/*
+		 * QQQ: We clear L2 page table now, so when L2 page table page
+		 *      is going to be freed, we can set it PG_ZERO flag ...
+		 *      This function is called only on section mappings, so
+		 *      hopefully it's not too big an overhead.
+		 *
+		 * XXX: If pmap is current, existing PT2MAP mapping could be
+		 *      used for zeroing.
+		 */
+		pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2);
+	}
+#ifdef INVARIANTS
+	else
+		KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)",
+		    __func__, pt2_wirecount_get(m, pte1_idx)));
+#endif
+	if (pt2pg_is_empty(m)) {
+		pmap_unwire_pt2pg(pmap, va, m);
+		pmap_add_delayed_free_list(m, free);
+	}
+}
+
+/*
+ * After removing a L2 page table entry, this routine is used to
+ * conditionally free the page, and manage the hold/wire counts.
+ *
+ * Nothing is done for va at or above VM_MAXUSER_ADDRESS; FALSE is
+ * returned in that case.  Otherwise the result of pmap_unwire_pt2()
+ * is returned.
+ */
+static boolean_t
+pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free)
+{
+	pt1_entry_t pte1;
+	vm_page_t mpte;
+
+	if (va >= VM_MAXUSER_ADDRESS)
+		return (FALSE);
+	pte1 = pte1_load(pmap_pte1(pmap, va));
+	mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
+	return (pmap_unwire_pt2(pmap, va, mpte, free));
+}
+
+/*************************************
+ *
+ *  Page management routines.
+ *
+ *************************************/
+
+CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+CTASSERT(_NPCM == 11);
+CTASSERT(_NPCPV == 336);
+
+static __inline struct pv_chunk *
+pv_to_chunk(pv_entry_t pv)
+{
+
+	/* Mask off the in-page offset to get the enclosing chunk. */
+	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
+}
+
+#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
+
+/*
+ * Ten full 32-bit words plus 16 bits of the eleventh word give
+ * 10 * 32 + 16 = 336 entries, matching the _NPCPV assertion above.
+ */
+#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
+#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
+
+static const uint32_t pc_freemask[_NPCM] = {
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
+	PC_FREE0_9, PC_FREE10
+};
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+    "Current number of pv entries");
+
+#ifdef PV_STATS
+static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
+    "Current number of pv entry chunks");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
+    "Current number of pv entry chunks allocated");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
+    "Current number of pv entry chunks frees");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail,
+    0, "Number of times tried to get a chunk page but failed.");
+
+static long pv_entry_frees, pv_entry_allocs;
+static int pv_entry_spare;
+
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
+    "Current number of pv entry frees");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs,
+    0, "Current number of pv entry allocs");
+SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
+    "Current number of spare pv entries");
+#endif
+
+/*
+ *  Is given page managed?
+ */ +static __inline boolean_t +is_managed(vm_paddr_t pa) +{ + vm_offset_t pgnum; + vm_page_t m; + + pgnum = atop(pa); + if (pgnum >= first_page) { + m = PHYS_TO_VM_PAGE(pa); + if (m == NULL) + return (FALSE); + if ((m->oflags & VPO_UNMANAGED) == 0) + return (TRUE); + } + return (FALSE); +} + +static __inline boolean_t +pte1_is_managed(pt1_entry_t pte1) +{ + + return (is_managed(pte1_pa(pte1))); +} + +static __inline boolean_t +pte2_is_managed(pt2_entry_t pte2) +{ + + return (is_managed(pte2_pa(pte2))); +} + +/* + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. + */ +static vm_page_t +pmap_pv_reclaim(pmap_t locked_pmap) +{ + struct pch newtail; + struct pv_chunk *pc; + struct md_page *pvh; + pt1_entry_t *pte1p; + pmap_t pmap; + pt2_entry_t *pte2p, tpte2; + pv_entry_t pv; + vm_offset_t va; + vm_page_t m, m_pc; + struct spglist free; + uint32_t inuse; + int bit, field, freed; + + PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); + pmap = NULL; + m_pc = NULL; + SLIST_INIT(&free); + TAILQ_INIT(&newtail); + while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || + SLIST_EMPTY(&free))) { + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + if (pmap != pc->pc_pmap) { + if (pmap != NULL) { + pmap_tlb_flush_ng(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + } + pmap = pc->pc_pmap; + /* Avoid deadlock and lock recursion. */ + if (pmap > locked_pmap) + PMAP_LOCK(pmap); + else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { + pmap = NULL; + TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + continue; + } + } + + /* + * Destroy every non-wired, 4 KB page mapping in the chunk. 
+ */ + freed = 0; + for (field = 0; field < _NPCM; field++) { + for (inuse = ~pc->pc_map[field] & pc_freemask[field]; + inuse != 0; inuse &= ~(1UL << bit)) { + bit = ffs(inuse) - 1; + pv = &pc->pc_pventry[field * 32 + bit]; + va = pv->pv_va; + pte1p = pmap_pte1(pmap, va); + if (pte1_is_section(pte1_load(pte1p))) + continue; + pte2p = pmap_pte2(pmap, va); + tpte2 = pte2_load(pte2p); + if ((tpte2 & PTE2_W) == 0) + tpte2 = pte2_load_clear(pte2p); + pmap_pte2_release(pte2p); + if ((tpte2 & PTE2_W) != 0) + continue; + KASSERT(tpte2 != 0, + ("pmap_pv_reclaim: pmap %p va %#x zero pte", + pmap, va)); + if (pte2_is_global(tpte2)) + tlb_flush(va); + m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); + if (pte2_is_dirty(tpte2)) + vm_page_dirty(m); + if ((tpte2 & PTE2_A) != 0) + vm_page_aflag_set(m, PGA_REFERENCED); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + if (TAILQ_EMPTY(&m->md.pv_list) && + (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) { + vm_page_aflag_clear(m, + PGA_WRITEABLE); + } + } + pc->pc_map[field] |= 1UL << bit; + pmap_unuse_pt2(pmap, va, &free); + freed++; + } + } + if (freed == 0) { + TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + continue; + } + /* Every freed mapping is for a 4 KB page. */ + pmap->pm_stats.resident_count -= freed; + PV_STAT(pv_entry_frees += freed); + PV_STAT(pv_entry_spare += freed); + pv_entry_count -= freed; + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + for (field = 0; field < _NPCM; field++) + if (pc->pc_map[field] != pc_freemask[field]) { + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, + pc_list); + TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); + + /* + * One freed pv entry in locked_pmap is + * sufficient. + */ + if (pmap == locked_pmap) + goto out; + break; + } + if (field == _NPCM) { + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + /* Entire chunk is free; return it. 
*/ + m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); + pmap_qremove((vm_offset_t)pc, 1); + pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); + break; + } + } +out: + TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); + if (pmap != NULL) { + pmap_tlb_flush_ng(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + } + if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { + m_pc = SLIST_FIRST(&free); + SLIST_REMOVE_HEAD(&free, plinks.s.ss); + /* Recycle a freed page table page. */ + m_pc->wire_count = 1; + atomic_add_int(&vm_cnt.v_wire_count, 1); + } + pmap_free_zero_pages(&free); + return (m_pc); +} + +static void +free_pv_chunk(struct pv_chunk *pc) +{ + vm_page_t m; + + TAILQ_REMOVE(&pv_chunks, pc, pc_lru); + PV_STAT(pv_entry_spare -= _NPCPV); + PV_STAT(pc_chunk_count--); + PV_STAT(pc_chunk_frees++); + /* entire chunk is free, return it */ + m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); + pmap_qremove((vm_offset_t)pc, 1); + vm_page_unwire(m, PQ_INACTIVE); + vm_page_free(m); + pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); +} + +/* + * Free the pv_entry back to the free list. + */ +static void +free_pv_entry(pmap_t pmap, pv_entry_t pv) +{ + struct pv_chunk *pc; + int idx, field, bit; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(pv_entry_frees++); + PV_STAT(pv_entry_spare++); + pv_entry_count--; + pc = pv_to_chunk(pv); + idx = pv - &pc->pc_pventry[0]; + field = idx / 32; + bit = idx % 32; + pc->pc_map[field] |= 1ul << bit; + for (idx = 0; idx < _NPCM; idx++) + if (pc->pc_map[idx] != pc_freemask[idx]) { + /* + * 98% of the time, pc is already at the head of the + * list. If it isn't already, move it to the head. 
+ */ + if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != + pc)) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, + pc_list); + } + return; + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + free_pv_chunk(pc); +} + +/* + * Get a new pv_entry, allocating a block from the system + * when needed. + */ +static pv_entry_t +get_pv_entry(pmap_t pmap, boolean_t try) +{ + static const struct timeval printinterval = { 60, 0 }; + static struct timeval lastprint; + int bit, field; + pv_entry_t pv; + struct pv_chunk *pc; + vm_page_t m; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + PV_STAT(pv_entry_allocs++); + pv_entry_count++; + if (pv_entry_count > pv_entry_high_water) + if (ratecheck(&lastprint, &printinterval)) + printf("Approaching the limit on PV entries, consider " + "increasing either the vm.pmap.shpgperproc or the " + "vm.pmap.pv_entry_max tunable.\n"); +retry: + pc = TAILQ_FIRST(&pmap->pm_pvchunk); + if (pc != NULL) { + for (field = 0; field < _NPCM; field++) { + if (pc->pc_map[field]) { + bit = ffs(pc->pc_map[field]) - 1; + break; + } + } + if (field < _NPCM) { + pv = &pc->pc_pventry[field * 32 + bit]; + pc->pc_map[field] &= ~(1ul << bit); + /* If this was the last item, move it to tail */ + for (field = 0; field < _NPCM; field++) + if (pc->pc_map[field] != 0) { + PV_STAT(pv_entry_spare--); + return (pv); /* not full, return */ + } + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(pv_entry_spare--); + return (pv); + } + } + /* + * Access to the pte2list "pv_vafree" is synchronized by the pvh + * global lock. If "pv_vafree" is currently non-empty, it will + * remain non-empty until pmap_pte2list_alloc() completes. 
+ */ + if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { + if (try) { + pv_entry_count--; + PV_STAT(pc_chunk_tryfail++); + return (NULL); + } + m = pmap_pv_reclaim(pmap); + if (m == NULL) + goto retry; + } + PV_STAT(pc_chunk_count++); + PV_STAT(pc_chunk_allocs++); + pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); + pmap_qenter((vm_offset_t)pc, &m, 1); + pc->pc_pmap = pmap; + pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ + for (field = 1; field < _NPCM; field++) + pc->pc_map[field] = pc_freemask[field]; + TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); + pv = &pc->pc_pventry[0]; + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(pv_entry_spare += _NPCPV - 1); + return (pv); +} + +/* + * Create a pv entry for page at pa for + * (pmap, va). + */ +static void +pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + pv_entry_t pv; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pv = get_pv_entry(pmap, FALSE); + pv->pv_va = va; + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); +} + +static __inline pv_entry_t +pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + if (pmap == PV_PMAP(pv) && va == pv->pv_va) { + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + break; + } + } + return (pv); +} + +static void +pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) +{ + pv_entry_t pv; + + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); + free_pv_entry(pmap, pv); +} + +static void +pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) +{ + struct md_page *pvh; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + pmap_pvh_free(&m->md, pmap, va); + if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if 
(TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } +} + +static void +pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + KASSERT((pa & PTE1_OFFSET) == 0, + ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); + + /* + * Transfer the 1mpage's pv entry for this mapping to the first + * page's pv list. + */ + pvh = pa_to_pvh(pa); + va = pte1_trunc(va); + pv = pmap_pvh_remove(pvh, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ + va_last = va + PTE1_SIZE - PAGE_SIZE; + do { + m++; + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_pv_demote_pte1: page %p is not managed", m)); + va += PAGE_SIZE; + pmap_insert_entry(pmap, va, m); + } while (va < va_last); +} + +static void +pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + vm_offset_t va_last; + vm_page_t m; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + KASSERT((pa & PTE1_OFFSET) == 0, + ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); + + /* + * Transfer the first page's pv entry for this mapping to the + * 1mpage's pv list. Aside from avoiding the cost of a call + * to get_pv_entry(), a transfer avoids the possibility that + * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() + * removes one of the mappings that is being promoted. + */ + m = PHYS_TO_VM_PAGE(pa); + va = pte1_trunc(va); + pv = pmap_pvh_remove(&m->md, pmap, va); + KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. 
*/ + va_last = va + PTE1_SIZE - PAGE_SIZE; + do { + m++; + va += PAGE_SIZE; + pmap_pvh_free(&m->md, pmap, va); + } while (va < va_last); +} + +/* + * Conditionally create a pv entry. + */ +static boolean_t +pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) +{ + pv_entry_t pv; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if (pv_entry_count < pv_entry_high_water && + (pv = get_pv_entry(pmap, TRUE)) != NULL) { + pv->pv_va = va; + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + return (TRUE); + } else + return (FALSE); +} + +/* + * Create the pv entries for each of the pages within a section. + */ +static boolean_t +pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) +{ + struct md_page *pvh; + pv_entry_t pv; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + if (pv_entry_count < pv_entry_high_water && + (pv = get_pv_entry(pmap, TRUE)) != NULL) { + pv->pv_va = va; + pvh = pa_to_pvh(pa); + TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); + return (TRUE); + } else + return (FALSE); +} + +/* + * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are + * within a single page table page (PT2) to a single 1MB page mapping. + * For promotion to occur, two conditions must be met: (1) the 4KB page + * mappings must map aligned, contiguous physical memory and (2) the 4KB page + * mappings must have identical characteristics. + * + * Managed (PG_MANAGED) mappings within the kernel address space are not + * promoted. The reason is that kernel PTE1s are replicated in each pmap but + * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only + * read the PTE1 from the kernel pmap. 
+ */ +static void +pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) +{ + pt1_entry_t npte1; + pt2_entry_t *fpte2p, fpte2, fpte2_fav; + pt2_entry_t *pte2p, pte2; + vm_offset_t pteva __unused; + vm_page_t m __unused; + + PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, + pmap, va, pte1_load(pte1p), pte1p)); + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is + * either invalid, unused, or does not map the first 4KB physical page + * within a 1MB page. + */ + fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); +setpte1: + fpte2 = pte2_load(fpte2p); + if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != + (PTE2_A | PTE2_V)) { + pmap_pte1_p_failures++; + CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", + __func__, va, pmap); + return; + } + if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { + pmap_pte1_p_failures++; + CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", + __func__, va, pmap); + return; + } + if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { + /* + * When page is not modified, PTE2_RO can be set without + * a TLB invalidation. + * + * Note: When modified bit is being set, then in hardware case, + * the TLB entry is re-read (updated) from PT2, and in + * software case (abort), the PTE2 is read from PT2 and + * TLB flushed if changed. The following cmpset() solves + * any race with setting this bit in both cases. + */ + if (!pte2_cmpset(fpte2p, fpte2, fpte2 | PTE2_RO)) + goto setpte1; + fpte2 |= PTE2_RO; + } + + /* + * Examine each of the other PTE2s in the specified PT2. Abort if this + * PTE2 maps an unexpected 4KB physical page or does not have identical + * characteristics to the first PTE2. 
+ */ + fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); + fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ + for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { +setpte2: + pte2 = pte2_load(pte2p); + if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { + pmap_pte1_p_failures++; + CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", + __func__, va, pmap); + return; + } + if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { + /* + * When page is not modified, PTE2_RO can be set + * without a TLB invalidation. See note above. + */ + if (!pte2_cmpset(pte2p, pte2, pte2 | PTE2_RO)) + goto setpte2; + pte2 |= PTE2_RO; + pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & + PTE2_FRAME); + CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", + __func__, pteva, pmap); + } + if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { + pmap_pte1_p_failures++; + CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", + __func__, va, pmap); + return; + } + + fpte2_fav -= PTE2_SIZE; + } + /* + * The page table page in its current state will stay in PT2TAB + * until the PTE1 mapping the section is demoted by pmap_demote_pte1() + * or destroyed by pmap_remove_pte1(). + * + * Note that L2 page table size is not equal to PAGE_SIZE. + */ + m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); + KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], + ("%s: PT2 page is out of range", __func__)); + KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), + ("%s: PT2 page's pindex is wrong", __func__)); + + /* + * Get pte1 from pte2 format. + */ + npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; + + /* + * Promote the pv entries. + */ + if (pte2_is_managed(fpte2)) + pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); + + /* + * Map the section. + */ + if (pmap == kernel_pmap) + pmap_kenter_pte1(va, npte1); + else + pte1_store(pte1p, npte1); + /* + * Flush old small mappings. 
We call single pmap_tlb_flush() in + * pmap_demote_pte1() and pmap_remove_pte1(), so we must be sure that + * no small mappings survive. We assume that given pmap is current and + * don't play game with PTE2_NG. + */ + pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); + + pmap_pte1_promotions++; + CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", + __func__, va, pmap); + + PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", + __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); +} + +/* + * Zero L2 page table page. + */ +static __inline void +pmap_clear_pt2(pt2_entry_t *fpte2p) +{ + pt2_entry_t *pte2p; + + for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) + pte2_clear(pte2p); + +} + +/* + * Removes a 1MB page mapping from the kernel pmap. + */ +static void +pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) +{ + vm_page_t m; + uint32_t pte1_idx; + pt2_entry_t *fpte2p; + vm_paddr_t pt2_pa; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + m = pmap_pt2_page(pmap, va); + if (m == NULL) + /* + * QQQ: Is this function called only on promoted pte1? + * We certainly do section mappings directly + * (without promotion) in kernel !!! + */ + panic("%s: missing pt2 page", __func__); + + pte1_idx = pte1_index(va); + + /* + * Initialize the L2 page table. + */ + fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); + pmap_clear_pt2(fpte2p); + + /* + * Remove the mapping. + */ + pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); + pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); + + /* + * QQQ: We do not need to invalidate PT2MAP mapping + * as we did not change it. I.e. the L2 page table page + * was and still is mapped the same way. 
+ */ +} + +/* + * Do the things to unmap a section in a process + */ +static void +pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, + struct spglist *free) +{ + pt1_entry_t opte1; + struct md_page *pvh; + vm_offset_t eva, va; + vm_page_t m; + + PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, + pte1_load(pte1p), pte1p)); + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT((sva & PTE1_OFFSET) == 0, + ("%s: sva is not 1mpage aligned", __func__)); + + opte1 = pte1_load_clear(pte1p); + if (pte1_is_wired(opte1)) + pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; + + /* + * If the mapping was global, invalidate it even if given pmap + * is not active (kernel_pmap is active always). The mapping should + * occupy one and only TLB entry. So, pmap_tlb_flush() called + * with aligned address should be sufficient. + */ + if (pte1_is_global(opte1)) + tlb_flush(sva); + pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; + if (pte1_is_managed(opte1)) { + pvh = pa_to_pvh(pte1_pa(opte1)); + pmap_pvh_free(pvh, pmap, sva); + eva = sva + PTE1_SIZE; + for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); + va < eva; va += PAGE_SIZE, m++) { + if (pte1_is_dirty(opte1)) + vm_page_dirty(m); + if (opte1 & PTE1_A) + vm_page_aflag_set(m, PGA_REFERENCED); + if (TAILQ_EMPTY(&m->md.pv_list) && + TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + } + if (pmap == kernel_pmap) { + /* + * L2 page table(s) can't be removed from kernel map as + * kernel counts on it (stuff around pmap_growkernel()). + */ + pmap_remove_kernel_pte1(pmap, pte1p, sva); + } else { + /* + * Get associated L2 page table page. + * It's possible that the page was never allocated. + */ + m = pmap_pt2_page(pmap, sva); + if (m != NULL) + pmap_unwire_pt2_all(pmap, sva, m, free); + } +} + +/* + * Fills L2 page table page with mappings to consecutive physical pages. 
+ */ +static __inline void +pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) +{ + pt2_entry_t *pte2p; + + for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { + pte2_store(pte2p, npte2); + npte2 += PTE2_SIZE; + } +} + +/* + * Tries to demote a 1MB page mapping. If demotion fails, the + * 1MB page mapping is invalidated. + */ +static boolean_t +pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) +{ + pt1_entry_t opte1, npte1; + pt2_entry_t *fpte2p, npte2; + vm_paddr_t pt2pg_pa, pt2_pa; + vm_page_t m; + struct spglist free; + uint32_t pte1_idx, isnew = 0; + + PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, + pmap, va, pte1_load(pte1p), pte1p)); + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + opte1 = pte1_load(pte1p); + KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); + + if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { + KASSERT(!pte1_is_wired(opte1), + ("%s: PT2 page for a wired mapping is missing", __func__)); + + /* + * Invalidate the 1MB page mapping and return + * "failure" if the mapping was never accessed or the + * allocation of the new page table page fails. + */ + if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, + pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | + VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { + SLIST_INIT(&free); + pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); + pmap_tlb_flush(pmap, pte1_trunc(va)); + pmap_free_zero_pages(&free); + CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", + __func__, va, pmap); + return (FALSE); + } + if (va < VM_MAXUSER_ADDRESS) + pmap->pm_stats.resident_count++; + + isnew = 1; + + /* + * We init all L2 page tables in the page even if + * we are going to change everything for one L2 page + * table in a while. + */ + pt2pg_pa = pmap_pt2pg_init(pmap, va, m); + } else { + if (va < VM_MAXUSER_ADDRESS) { + if (pt2_is_empty(m, va)) + isnew = 1; /* Demoting section w/o promotion. 
*/ +#ifdef INVARIANTS + else + KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" + " count %u", __func__, + pt2_wirecount_get(m, pte1_index(va)))); +#endif + } + } + + pt2pg_pa = VM_PAGE_TO_PHYS(m); + pte1_idx = pte1_index(va); + /* + * If the pmap is current, then the PT2MAP can provide access to + * the page table page (promoted L2 page tables are not unmapped). + * Otherwise, temporarily map the L2 page table page (m) into + * the kernel's address space at either PADDR1 or PADDR2. + * + * Note that L2 page table size is not equal to PAGE_SIZE. + */ + if (pmap_is_current(pmap)) + fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); + else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { + if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { + pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); +#ifdef SMP + PMAP1cpu = PCPU_GET(cpuid); +#endif + tlb_flush_local((vm_offset_t)PADDR1); + PMAP1changed++; + } else +#ifdef SMP + if (PMAP1cpu != PCPU_GET(cpuid)) { + PMAP1cpu = PCPU_GET(cpuid); + tlb_flush_local((vm_offset_t)PADDR1); + PMAP1changedcpu++; + } else +#endif + PMAP1unchanged++; + fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); + } else { + mtx_lock(&PMAP2mutex); + if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { + pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); + tlb_flush((vm_offset_t)PADDR2); + } + fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); + } + pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); + npte1 = PTE1_LINK(pt2_pa); + + KASSERT((opte1 & PTE1_A) != 0, + ("%s: opte1 is missing PTE1_A", __func__)); + KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, + ("%s: opte1 has PTE1_NM", __func__)); + + /* + * Get pte2 from pte1 format. + */ + npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; + + /* + * If the L2 page table page is new, initialize it. If the mapping + * has changed attributes, update the page table entries. 
+ */ + if (isnew != 0) { + pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); + pmap_fill_pt2(fpte2p, npte2); + } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != + (npte2 & PTE2_PROMOTE)) + pmap_fill_pt2(fpte2p, npte2); + + KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), + ("%s: fpte2p and npte2 map different physical addresses", + __func__)); + + /* + * PMAP2mutex is held only when fpte2p was derived from PADDR2. + * An L2 page table is smaller than a page, so page_pt2() may + * yield an address at a non-zero offset inside the PADDR2 page. + * Compare against that same expression, not the window base, + * otherwise the mutex would never be released in that case. + */ + if (fpte2p == page_pt2((vm_offset_t)PADDR2, pte1_idx)) + mtx_unlock(&PMAP2mutex); + + /* + * Demote the mapping. This pmap is locked. The old PTE1 has + * PTE1_A set. If the old PTE1 does not have PTE1_RO set, it also + * does not have PTE1_NM set. Thus, there is no danger of a race with + * another processor changing the setting of PTE1_A and/or PTE1_NM + * between the read above and the store below. + */ + if (pmap == kernel_pmap) + pmap_kenter_pte1(va, npte1); + else + pte1_store(pte1p, npte1); + + /* + * Flush old big mapping. The mapping should occupy one and only + * TLB entry. So, pmap_tlb_flush() called with aligned address + * should be sufficient. + */ + pmap_tlb_flush(pmap, pte1_trunc(va)); + + /* + * Demote the pv entry. This depends on the earlier demotion + * of the mapping. Specifically, the (re)creation of a per- + * page pv entry might trigger the execution of pmap_pv_reclaim(), + * which might reclaim a newly (re)created per-page pv entry + * and destroy the associated mapping. In order to destroy + * the mapping, the PTE1 must have already changed from mapping + * the 1mpage to referencing the page table page. + */ + if (pte1_is_managed(opte1)) + pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); + + pmap_pte1_demotions++; + CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", + __func__, va, pmap); + + PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", + __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); + return (TRUE); +} + +/* + * Insert the given physical page (p) at + * the specified virtual address (v) in the + * target physical map with the protection requested. 
+ * + * If specified, the page will be wired down, meaning + * that the related pte can not be reclaimed. + * + * NB: This is the only routine which MAY NOT lazy-evaluate + * or lose information. That is, this routine must actually + * insert this page into the given map NOW. + */ +int +pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, + u_int flags, int8_t psind) +{ + pt1_entry_t *pte1p; + pt2_entry_t *pte2p; + pt2_entry_t npte2, opte2; + pv_entry_t pv; + vm_paddr_t opa, pa; + vm_page_t mpte2, om; + boolean_t wired; + + va = trunc_page(va); + mpte2 = NULL; + wired = (flags & PMAP_ENTER_WIRED) != 0; + + KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); + KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, + ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, + va)); + if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) + VM_OBJECT_ASSERT_LOCKED(m->object); + + rw_wlock(&pvh_global_lock); + PMAP_LOCK(pmap); + sched_pin(); + + /* + * In the case that a page table page is not + * resident, we are creating it here. + */ + if (va < VM_MAXUSER_ADDRESS) { + mpte2 = pmap_allocpte2(pmap, va, flags); + if (mpte2 == NULL) { + KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, + ("pmap_allocpte2 failed with sleep allowed")); + sched_unpin(); + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + return (KERN_RESOURCE_SHORTAGE); + } + } + pte1p = pmap_pte1(pmap, va); + if (pte1_is_section(pte1_load(pte1p))) + panic("%s: attempted on 1MB page", __func__); + pte2p = pmap_pte2_quick(pmap, va); + if (pte2p == NULL) + panic("%s: invalid L1 page table entry va=%#x", __func__, va); + + om = NULL; + pa = VM_PAGE_TO_PHYS(m); + opte2 = pte2_load(pte2p); + opa = pte2_pa(opte2); + /* + * Mapping has not changed, must be protection or wiring change. + */ + if (pte2_is_valid(opte2) && (opa == pa)) { + /* + * Wiring change, just update stats. 
We don't worry about + * wiring PT2 pages as they remain resident as long as there + * are valid mappings in them. Hence, if a user page is wired, + * the PT2 page will be also. + */ + if (wired && !pte2_is_wired(opte2)) + pmap->pm_stats.wired_count++; + else if (!wired && pte2_is_wired(opte2)) + pmap->pm_stats.wired_count--; + + /* + * Remove extra pte2 reference + */ + if (mpte2) + pt2_wirecount_dec(mpte2, pte1_index(va)); + if (pte2_is_managed(opte2)) + om = m; + goto validate; + } + + /* + * QQQ: We think that changing physical address on writeable mapping + * is not safe. Well, maybe on kernel address space with correct + * locking, it can make sense. However, we have no idea why + * anyone should do that on user address space. Are we wrong? + */ + KASSERT((opa == 0) || (opa == pa) || + !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), + ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", + __func__, pmap, va, opte2, opa, pa, flags, prot)); + + pv = NULL; + + /* + * Mapping has changed, invalidate old range and fall through to + * handle validating new mapping. + */ + if (opa) { + if (pte2_is_wired(opte2)) + pmap->pm_stats.wired_count--; + if (pte2_is_managed(opte2)) { + om = PHYS_TO_VM_PAGE(opa); + pv = pmap_pvh_remove(&om->md, pmap, va); + } + /* + * Remove extra pte2 reference + */ + if (mpte2 != NULL) + pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); + } else + pmap->pm_stats.resident_count++; + + /* + * Enter on the PV list if part of our managed memory. + */ + if ((m->oflags & VPO_UNMANAGED) == 0) { + KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, + ("%s: managed mapping within the clean submap", __func__)); + if (pv == NULL) + pv = get_pv_entry(pmap, FALSE); + pv->pv_va = va; + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + } else if (pv != NULL) + free_pv_entry(pmap, pv); + + /* + * Increment counters + */ + if (wired) + pmap->pm_stats.wired_count++; + +validate: + /* + * Now validate mapping with desired protection/wiring. 
+ */ + npte2 = PTE2(pa, PTE2_NM, m->md.pat_mode); + if (prot & VM_PROT_WRITE) { + if (pte2_is_managed(npte2)) + vm_page_aflag_set(m, PGA_WRITEABLE); + } + else + npte2 |= PTE2_RO; + if ((prot & VM_PROT_EXECUTE) == 0) + npte2 |= PTE2_NX; + if (wired) + npte2 |= PTE2_W; + if (va < VM_MAXUSER_ADDRESS) + npte2 |= PTE2_U; + if (pmap != kernel_pmap) + npte2 |= PTE2_NG; + + /* + * If the mapping or permission bits are different, we need + * to update the pte2. + * + * QQQ: Think again and again what to do + * if the mapping is going to be changed! + */ + if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { + /* + * Sync icache if exec permission and attribute PTE2_ATTR_WB_WA + * is set. Do it now, before the mapping is stored and made + * valid for hardware table walk. If done later, there is a race + * for other threads of current process in lazy loading case. + * + * QQQ: (1) Is there any better place or way + * to sync icache? + * (2) Now, we do it on a page basis. + */ + if ((prot & VM_PROT_EXECUTE) && + (m->md.pat_mode == PTE2_ATTR_WB_WA) && + ((opa != pa) || (opte2 & PTE2_NX))) + cache_icache_sync_fresh(va, pa, PAGE_SIZE); + + npte2 |= PTE2_A; + /* + * NOTE(review): this tests flags rather than prot; presumably + * flags carries the faulting access type -- confirm. + */ + if (flags & VM_PROT_WRITE) + npte2 &= ~PTE2_NM; + if (opte2 & PTE2_V) { + /* Change mapping with break-before-make approach. */ + opte2 = pte2_load_clear(pte2p); + pmap_tlb_flush(pmap, va); + pte2_store(pte2p, npte2); + if (opte2 & PTE2_A) { + if (pte2_is_managed(opte2)) + vm_page_aflag_set(om, PGA_REFERENCED); + } + if (pte2_is_dirty(opte2)) { + if (pte2_is_managed(opte2)) + vm_page_dirty(om); + } + if (pte2_is_managed(opte2) && + TAILQ_EMPTY(&om->md.pv_list) && + ((om->flags & PG_FICTITIOUS) != 0 || + TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) + vm_page_aflag_clear(om, PGA_WRITEABLE); + } else + pte2_store(pte2p, npte2); + } +#if 0 + else { + /* + * QQQ: At a time when both access and not modified bits are + * emulated by software, this should not happen. 
Some + * analysis is needed if this really happens. A missing + * tlb flush somewhere could be the reason. + */ + panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, + va, opte2, npte2); + } +#endif + /* + * If both the L2 page table page and the reservation are fully + * populated, then attempt promotion. + */ + if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && + sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && + vm_reserv_level_iffullpop(m) == 0) + pmap_promote_pte1(pmap, pte1p, va); + sched_unpin(); + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + return (KERN_SUCCESS); +} + +/* + * Do the things to unmap a page in a process. + */ +static int +pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, + struct spglist *free) +{ + pt2_entry_t opte2; + vm_page_t m; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + opte2 = pte2_load_clear(pte2p); + KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", + __func__, pmap, va, opte2)); + if (opte2 & PTE2_W) + pmap->pm_stats.wired_count -= 1; + /* + * If the mapping was global, invalidate it even if given pmap + * is not active (kernel_pmap is active always). + */ + if (pte2_is_global(opte2)) + tlb_flush(va); + pmap->pm_stats.resident_count -= 1; + if (pte2_is_managed(opte2)) { + m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); + if (pte2_is_dirty(opte2)) + vm_page_dirty(m); + if (opte2 & PTE2_A) + vm_page_aflag_set(m, PGA_REFERENCED); + pmap_remove_entry(pmap, m, va); + } + return (pmap_unuse_pt2(pmap, va, free)); +} + +/* + * Remove a single page from a process address space. 
+ */ +static void +pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) +{ + pt2_entry_t *pte2p; + + rw_assert(&pvh_global_lock, RA_WLOCKED); + KASSERT(curthread->td_pinned > 0, + ("%s: curthread not pinned", __func__)); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || + !pte2_is_valid(pte2_load(pte2p))) + return; + pmap_remove_pte2(pmap, pte2p, va, free); + pmap_tlb_flush(pmap, va); +} + +/* + * Remove the given range of addresses from the specified map. + * + * It is assumed that the start and end are properly + * rounded to the page size. + */ +void +pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +{ + vm_offset_t nextva; + pt1_entry_t *pte1p, pte1; + pt2_entry_t *pte2p, pte2; + struct spglist free; + int anyvalid; + + /* + * Perform an unsynchronized read. This is, however, safe. + */ + if (pmap->pm_stats.resident_count == 0) + return; + + anyvalid = 0; + SLIST_INIT(&free); + + rw_wlock(&pvh_global_lock); + sched_pin(); + PMAP_LOCK(pmap); + + /* + * Special handling of removing one page. A very common + * operation and easy to short circuit some code. + */ + if (sva + PAGE_SIZE == eva) { + pte1 = pte1_load(pmap_pte1(pmap, sva)); + if (pte1_is_link(pte1)) { + pmap_remove_page(pmap, sva, &free); + goto out; + } + } + + for (; sva < eva; sva = nextva) { + /* + * Calculate address for next L2 page table. + */ + nextva = pte1_trunc(sva + PTE1_SIZE); + if (nextva < sva) + nextva = eva; + if (pmap->pm_stats.resident_count == 0) + break; + + pte1p = pmap_pte1(pmap, sva); + pte1 = pte1_load(pte1p); + + /* + * Weed out invalid mappings. Note: we assume that the L1 page + * table is always allocated, and in kernel virtual. + */ + if (pte1 == 0) + continue; + + if (pte1_is_section(pte1)) { + /* + * Are we removing the entire large page? If not, + * demote the mapping and fall through. 
+ */ + if (sva + PTE1_SIZE == nextva && eva >= nextva) { + /* + * The TLB entry for global mapping is + * invalidated by pmap_remove_pte1(). + */ + if (!pte1_is_global(pte1)) + anyvalid = 1; + pmap_remove_pte1(pmap, pte1p, sva, &free); + continue; + } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { + /* The large page mapping was destroyed. */ + continue; + } +#ifdef INVARIANTS + else { + /* Update pte1 after demotion. */ + pte1 = pte1_load(pte1p); + } +#endif + } + + KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" + " is not link", __func__, pmap, sva, pte1, pte1p)); + + /* + * Limit our scan to either the end of the va represented + * by the current L2 page table page, or to the end of the + * range being removed. + */ + if (nextva > eva) + nextva = eva; + + for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; + pte2p++, sva += PAGE_SIZE) { + pte2 = pte2_load(pte2p); + if (!pte2_is_valid(pte2)) + continue; + + /* + * The TLB entry for global mapping is invalidated + * by pmap_remove_pte2(). + */ + if (!pte2_is_global(pte2)) + anyvalid = 1; + if (pmap_remove_pte2(pmap, pte2p, sva, &free)) + break; + } + } +out: + sched_unpin(); + if (anyvalid) + pmap_tlb_flush_ng(pmap); + rw_wunlock(&pvh_global_lock); + PMAP_UNLOCK(pmap); + pmap_free_zero_pages(&free); +} + +/* + * Routine: pmap_remove_all + * Function: + * Removes this physical page from + * all physical maps in which it resides. + * Reflects back modify bits to the pager. + * + * Notes: + * Original versions of this routine were very + * inefficient because they iteratively called + * pmap_remove (slow...) 
+ */ + +void +pmap_remove_all(vm_page_t m) +{ + struct md_page *pvh; + pv_entry_t pv; + pmap_t pmap; + pt2_entry_t *pte2p, opte2; + pt1_entry_t *pte1p; + vm_offset_t va; + struct spglist free; + + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("%s: page %p is not managed", __func__, m)); + SLIST_INIT(&free); + rw_wlock(&pvh_global_lock); + sched_pin(); + if ((m->flags & PG_FICTITIOUS) != 0) + goto small_mappings; + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { + va = pv->pv_va; + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pte1p = pmap_pte1(pmap, va); + (void)pmap_demote_pte1(pmap, pte1p, va); + PMAP_UNLOCK(pmap); + } +small_mappings: + while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { + pmap = PV_PMAP(pv); + PMAP_LOCK(pmap); + pmap->pm_stats.resident_count--; + pte1p = pmap_pte1(pmap, pv->pv_va); + KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " + "a 1mpage in page %p's pv list", __func__, m)); + pte2p = pmap_pte2_quick(pmap, pv->pv_va); + opte2 = pte2_load_clear(pte2p); + KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", + __func__, pmap, pv->pv_va)); + if (pte2_is_wired(opte2)) + pmap->pm_stats.wired_count--; + if (opte2 & PTE2_A) + vm_page_aflag_set(m, PGA_REFERENCED); + + /* + * Update the vm_page_t clean and reference bits. + */ + if (pte2_is_dirty(opte2)) + vm_page_dirty(m); + pmap_unuse_pt2(pmap, pv->pv_va, &free); + pmap_tlb_flush(pmap, pv->pv_va); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + free_pv_entry(pmap, pv); + PMAP_UNLOCK(pmap); + } + vm_page_aflag_clear(m, PGA_WRITEABLE); + sched_unpin(); + rw_wunlock(&pvh_global_lock); + pmap_free_zero_pages(&free); +} + +/* + * Just subroutine for pmap_remove_pages() to reasonably satisfy + * good coding style, a.k.a. 80 character line width limit hell. 
+ */ +static __inline void +pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, + struct spglist *free) +{ + vm_paddr_t pa; + vm_page_t m, mt, mpt2pg; + struct md_page *pvh; + + pa = pte1_pa(pte1); + m = PHYS_TO_VM_PAGE(pa); + + KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", + __func__, m, m->phys_addr, pa)); + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("%s: bad pte1 %#x", __func__, pte1)); + + if (pte1_is_dirty(pte1)) { + for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } + + pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; + pvh = pa_to_pvh(pa); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) + if (TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_aflag_clear(mt, PGA_WRITEABLE); + } + mpt2pg = pmap_pt2_page(pmap, pv->pv_va); + if (mpt2pg != NULL) + pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); +} + +/* + * Just subroutine for pmap_remove_pages() to reasonably satisfy + * good coding style, a.k.a. 80 character line width limit hell. 
+ */ +static __inline void +pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, + struct spglist *free) +{ + vm_paddr_t pa; + vm_page_t m; + struct md_page *pvh; + + pa = pte2_pa(pte2); + m = PHYS_TO_VM_PAGE(pa); + + KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", + __func__, m, m->phys_addr, pa)); + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("%s: bad pte2 %#x", __func__, pte2)); + + if (pte2_is_dirty(pte2)) + vm_page_dirty(m); + + pmap->pm_stats.resident_count--; + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(pa); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + pmap_unuse_pt2(pmap, pv->pv_va, free); +} + +/* + * Remove all pages from specified address space this aids process + * exit speeds. Also, this code is special cased for current process + * only, but can have the more generic (and slightly slower) mode enabled. + * This is much faster than pmap_remove in the case of running down + * an entire address space. 
+ */
+void
+pmap_remove_pages(pmap_t pmap)
+{
+	pt1_entry_t *pte1p, pte1;
+	pt2_entry_t *pte2p, pte2;
+	pv_entry_t pv;
+	struct pv_chunk *pc, *npc;
+	struct spglist free;
+	int field, idx;
+	int32_t bit;
+	uint32_t inuse, bitmask;
+	boolean_t allfree;
+
+	/* Only the pmap of the current thread's process is handled. */
+	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
+		printf("warning: %s called with non-current pmap\n", __func__);
+		return;
+	}
+	SLIST_INIT(&free);
+	rw_wlock(&pvh_global_lock);
+	PMAP_LOCK(pmap);
+	sched_pin();
+	/* Walk every pv chunk of the pmap; chunks may be freed on the way. */
+	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
+		KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p",
+		    __func__, pmap, pc->pc_pmap));
+		allfree = TRUE;
+		for (field = 0; field < _NPCM; field++) {
+			/* A clear bit in pc_map marks a pv entry in use. */
+			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+			while (inuse != 0) {
+				bit = ffs(inuse) - 1;
+				bitmask = 1UL << bit;
+				idx = field * 32 + bit;
+				pv = &pc->pc_pventry[idx];
+				inuse &= ~bitmask;
+
+				/*
+				 * Note that we cannot remove wired pages
+				 * from a process' mapping at this time
+				 */
+				pte1p = pmap_pte1(pmap, pv->pv_va);
+				pte1 = pte1_load(pte1p);
+				if (pte1_is_section(pte1)) {
+					if (pte1_is_wired(pte1)) {
+						allfree = FALSE;
+						continue;
+					}
+					pte1_clear(pte1p);
+					pmap_remove_pte1_quick(pmap, pte1, pv,
+					    &free);
+				}
+				else if (pte1_is_link(pte1)) {
+					pte2p = pt2map_entry(pv->pv_va);
+					pte2 = pte2_load(pte2p);
+
+					if (!pte2_is_valid(pte2)) {
+						printf("%s: pmap %p va %#x "
+						    "pte2 %#x\n", __func__,
+						    pmap, pv->pv_va, pte2);
+						panic("bad pte2");
+					}
+
+					if (pte2_is_wired(pte2)) {
+						allfree = FALSE;
+						continue;
+					}
+					pte2_clear(pte2p);
+					pmap_remove_pte2_quick(pmap, pte2, pv,
+					    &free);
+				} else {
+					printf("%s: pmap %p va %#x pte1 %#x\n",
+					    __func__, pmap, pv->pv_va, pte1);
+					panic("bad pte1");
+				}
+
+				/* Mark free */
+				PV_STAT(pv_entry_frees++);
+				PV_STAT(pv_entry_spare++);
+				pv_entry_count--;
+				pc->pc_map[field] |= bitmask;
+			}
+		}
+		/* A chunk with no wired entries left is released entirely. */
+		if (allfree) {
+			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+			free_pv_chunk(pc);
+		}
+	}
+	sched_unpin();
+	pmap_tlb_flush_ng(pmap);
+	rw_wunlock(&pvh_global_lock);
+	PMAP_UNLOCK(pmap);
+	/* Page table pages collected on "free" are released after unlocking. */
+	pmap_free_zero_pages(&free);
+}
+
+/*
+ * This code makes some *MAJOR* assumptions:
+ * 1. Current pmap & pmap exists.
+ * 2. Not wired.
+ * 3. Read access.
+ * 4. No L2 page table pages.
+ * but is *MUCH* faster than pmap_enter...
+ *
+ * "mpt2pg" is the L2 page table page returned by a previous call (or NULL).
+ * The return value is the L2 page table page used for this mapping, or NULL
+ * when no mapping was created or when va is not a user address.
+ */
+static vm_page_t
+pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
+    vm_prot_t prot, vm_page_t mpt2pg)
+{
+	pt2_entry_t *pte2p, pte2;
+	vm_paddr_t pa;
+	struct spglist free;
+	uint32_t l2prot;
+
+	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
+	    (m->oflags & VPO_UNMANAGED) != 0,
+	    ("%s: managed mapping within the clean submap", __func__));
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	/*
+	 * In the case that a L2 page table page is not
+	 * resident, we are creating it here.
+	 */
+	if (va < VM_MAXUSER_ADDRESS) {
+		u_int pte1_idx;
+		pt1_entry_t pte1, *pte1p;
+		vm_paddr_t pt2_pa;
+
+		/*
+		 * Get L1 page table things.
+		 */
+		pte1_idx = pte1_index(va);
+		pte1p = pmap_pte1(pmap, va);
+		pte1 = pte1_load(pte1p);
+
+		if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) {
+			/*
+			 * Each of NPT2_IN_PG L2 page tables on the page can
+			 * come here. Make sure that associated L1 page table
+			 * link is established.
+			 *
+			 * QQQ: It comes that we don't establish all links to
+			 * L2 page tables for newly allocated L2 page
+			 * tables page.
+			 */
+			KASSERT(!pte1_is_section(pte1),
+			    ("%s: pte1 %#x is section", __func__, pte1));
+			if (!pte1_is_link(pte1)) {
+				pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg),
+				    pte1_idx);
+				pte1_store(pte1p, PTE1_LINK(pt2_pa));
+			}
+			pt2_wirecount_inc(mpt2pg, pte1_idx);
+		} else {
+			/*
+			 * If the L2 page table page is mapped, we just
+			 * increment the hold count, and activate it.
+			 */
+			if (pte1_is_section(pte1)) {
+				return (NULL);
+			} else if (pte1_is_link(pte1)) {
+				mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1));
+				pt2_wirecount_inc(mpt2pg, pte1_idx);
+			} else {
+				mpt2pg = _pmap_allocpte2(pmap, va,
+				    PMAP_ENTER_NOSLEEP);
+				if (mpt2pg == NULL)
+					return (NULL);
+			}
+		}
+	} else {
+		mpt2pg = NULL;
+	}
+
+	/*
+	 * This call to pt2map_entry() makes the assumption that we are
+	 * entering the page into the current pmap. In order to support
+	 * quick entry into any pmap, one would likely use pmap_pte2_quick().
+	 * But that isn't as quick as pt2map_entry().
+	 */
+	pte2p = pt2map_entry(va);
+	pte2 = pte2_load(pte2p);
+	if (pte2_is_valid(pte2)) {
+		if (mpt2pg != NULL) {
+			/*
+			 * Remove extra pte2 reference
+			 */
+			pt2_wirecount_dec(mpt2pg, pte1_index(va));
+			mpt2pg = NULL;
+		}
+		return (NULL);
+	}
+
+	/*
+	 * Enter on the PV list if part of our managed memory.
+	 */
+	if ((m->oflags & VPO_UNMANAGED) == 0 &&
+	    !pmap_try_insert_pv_entry(pmap, va, m)) {
+		if (mpt2pg != NULL) {
+			SLIST_INIT(&free);
+			if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) {
+				pmap_tlb_flush(pmap, va);
+				pmap_free_zero_pages(&free);
+			}
+
+			mpt2pg = NULL;
+		}
+		return (NULL);
+	}
+
+	/*
+	 * Increment counters
+	 */
+	pmap->pm_stats.resident_count++;
+
+	/*
+	 * Now validate mapping with RO protection
+	 */
+	pa = VM_PAGE_TO_PHYS(m);
+	l2prot = PTE2_RO | PTE2_NM;
+	if (va < VM_MAXUSER_ADDRESS)
+		l2prot |= PTE2_U | PTE2_NG;
+	if ((prot & VM_PROT_EXECUTE) == 0)
+		l2prot |= PTE2_NX;
+	else if (m->md.pat_mode == PTE2_ATTR_WB_WA) {
+		/*
+		 * Sync icache if exec permission and attribute PTE2_ATTR_WB_WA
+		 * is set. QQQ: For more info, see comments in pmap_enter().
+		 */
+		cache_icache_sync_fresh(va, pa, PAGE_SIZE);
+	}
+	pte2_store(pte2p, PTE2(pa, l2prot, m->md.pat_mode));
+
+	return (mpt2pg);
+}
+
+/*
+ * Locked wrapper around pmap_enter_quick_locked() for a single page; the
+ * returned L2 page table page is not needed and is discarded.
+ */
+void
+pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
+{
+
+	rw_wlock(&pvh_global_lock);
+	PMAP_LOCK(pmap);
+	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
+	rw_wunlock(&pvh_global_lock);
+	PMAP_UNLOCK(pmap);
+}
+
+/*
+ * Tries to create 1MB page mapping. Returns TRUE if successful and
+ * FALSE otherwise. Fails if (1) a page table page cannot be allocated without
+ * blocking, (2) a mapping already exists at the specified virtual address, or
+ * (3) a pv entry cannot be allocated without reclaiming another pv entry.
+ */
+static boolean_t
+pmap_enter_pte1(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
+{
+	pt1_entry_t *pte1p;
+	vm_paddr_t pa;
+	uint32_t l1prot;
+
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	pte1p = pmap_pte1(pmap, va);
+	if (pte1_is_valid(pte1_load(pte1p))) {
+		CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__,
+		    va, pmap);
+		return (FALSE);
+	}
+	if ((m->oflags & VPO_UNMANAGED) == 0) {
+		/*
+		 * Abort this mapping if its PV entry could not be created.
+		 */
+		if (!pmap_pv_insert_pte1(pmap, va, VM_PAGE_TO_PHYS(m))) {
+			CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p",
+			    __func__, va, pmap);
+			return (FALSE);
+		}
+	}
+	/*
+	 * Increment counters.
+	 */
+	pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE;
+
+	/*
+	 * Map the section.
+	 *
+	 * QQQ: Why VM_PROT_WRITE is not evaluated and the mapping is
+	 * made readonly?
+	 */
+	pa = VM_PAGE_TO_PHYS(m);
+	l1prot = PTE1_RO | PTE1_NM;
+	if (va < VM_MAXUSER_ADDRESS)
+		l1prot |= PTE1_U | PTE1_NG;
+	if ((prot & VM_PROT_EXECUTE) == 0)
+		l1prot |= PTE1_NX;
+	else if (m->md.pat_mode == PTE2_ATTR_WB_WA) {
+		/*
+		 * Sync icache if exec permission and attribute PTE2_ATTR_WB_WA
+		 * is set. QQQ: For more info, see comments in pmap_enter().
+		 */
+		cache_icache_sync_fresh(va, pa, PTE1_SIZE);
+	}
+	pte1_store(pte1p, PTE1(pa, l1prot, ATTR_TO_L1(m->md.pat_mode)));
+
+	pmap_pte1_mappings++;
+	CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va,
+	    pmap);
+	return (TRUE);
+}
+
+/*
+ * Maps a sequence of resident pages belonging to the same object.
+ * The sequence begins with the given page m_start. This page is
+ * mapped at the given virtual address start. Each subsequent page is
+ * mapped at a virtual address that is offset from start by the same
+ * amount as the page is offset from m_start within the object. The
+ * last page in the sequence is the page with the largest offset from
+ * m_start that can be mapped at a virtual address less than the given
+ * virtual address end. Not every virtual page between start and end
+ * is mapped; only those for which a resident page exists with the
+ * corresponding offset from m_start are mapped.
+ */
+void
+pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
+    vm_page_t m_start, vm_prot_t prot)
+{
+	vm_offset_t va;
+	vm_page_t m, mpt2pg;
+	vm_pindex_t diff, psize;
+
+	PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n",
+	    __func__, pmap, start, end, m_start, prot));
+
+	VM_OBJECT_ASSERT_LOCKED(m_start->object);
+	psize = atop(end - start);
+	mpt2pg = NULL;
+	m = m_start;
+	rw_wlock(&pvh_global_lock);
+	PMAP_LOCK(pmap);
+	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
+		va = start + ptoa(diff);
+		/*
+		 * Try a 1MB mapping first when va is section aligned, the
+		 * whole section fits below end and the page is a superpage;
+		 * on success skip to the last 4KB page of that section.
+		 */
+		if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end &&
+		    m->psind == 1 && sp_enabled &&
+		    pmap_enter_pte1(pmap, va, m, prot))
+			m = &m[PTE1_SIZE / PAGE_SIZE - 1];
+		else
+			mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot,
+			    mpt2pg);
+		m = TAILQ_NEXT(m, listq);
+	}
+	rw_wunlock(&pvh_global_lock);
+	PMAP_UNLOCK(pmap);
+}
+
+/*
+ * This code maps large physical mmap regions into the
+ * processor address space. Note that some shortcuts
+ * are taken, but the code works.
+ */
+void
+pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
+    vm_pindex_t pindex, vm_size_t size)
+{
+	pt1_entry_t *pte1p;
+	vm_paddr_t pa, pte2_pa;
+	vm_page_t p;
+	int pat_mode;
+	u_int l1attr, l1prot;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
+	    ("%s: non-device object", __func__));
+	/* Nothing is mapped unless both addr and size are 1MB aligned. */
+	if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) {
+		if (!vm_object_populate(object, pindex, pindex + atop(size)))
+			return;
+		p = vm_page_lookup(object, pindex);
+		KASSERT(p->valid == VM_PAGE_BITS_ALL,
+		    ("%s: invalid page %p", __func__, p));
+		pat_mode = p->md.pat_mode;
+
+		/*
+		 * Abort the mapping if the first page is not physically
+		 * aligned to a 1MB page boundary.
+		 */
+		pte2_pa = VM_PAGE_TO_PHYS(p);
+		if (pte2_pa & PTE1_OFFSET)
+			return;
+
+		/*
+		 * Skip the first page. Abort the mapping if the rest of
+		 * the pages are not physically contiguous or have differing
+		 * memory attributes.
+		 */
+		p = TAILQ_NEXT(p, listq);
+		for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size;
+		    pa += PAGE_SIZE) {
+			KASSERT(p->valid == VM_PAGE_BITS_ALL,
+			    ("%s: invalid page %p", __func__, p));
+			if (pa != VM_PAGE_TO_PHYS(p) ||
+			    pat_mode != p->md.pat_mode)
+				return;
+			p = TAILQ_NEXT(p, listq);
+		}
+
+		/*
+		 * Map using 1MB pages.
+		 *
+		 * QQQ: Well, we are mapping a section, so same condition must
+		 * be hold like during promotion. It looks that only RW mapping
+		 * is done here, so readonly mapping must be done elsewhere.
+		 */
+		l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A;
+		l1attr = ATTR_TO_L1(pat_mode);
+		PMAP_LOCK(pmap);
+		for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) {
+			pte1p = pmap_pte1(pmap, addr);
+			if (!pte1_is_valid(pte1_load(pte1p))) {
+				pte1_store(pte1p, PTE1(pa, l1prot, l1attr));
+				pmap->pm_stats.resident_count += PTE1_SIZE /
+				    PAGE_SIZE;
+				pmap_pte1_mappings++;
+			}
+			/* Else continue on if the PTE1 is already valid. */
+			addr += PTE1_SIZE;
+		}
+		PMAP_UNLOCK(pmap);
+	}
+}
+
+/*
+ * Do the things to protect a 1mpage in a process.
+ *
+ * Returns TRUE when a non-global section entry was changed, in which case
+ * the caller is expected to flush the TLB; a changed global entry is flushed
+ * here.
+ */
+static boolean_t
+pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva,
+    vm_prot_t prot)
+{
+	pt1_entry_t npte1, opte1;
+	vm_offset_t eva, va;
+	vm_page_t m;
+	boolean_t anychanged;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT((sva & PTE1_OFFSET) == 0,
+	    ("%s: sva is not 1mpage aligned", __func__));
+	anychanged = FALSE;
+retry:
+	opte1 = npte1 = pte1_load(pte1p);
+	if (pte1_is_managed(opte1)) {
+		/* A dirty section dirties every 4KB page it covers. */
+		eva = sva + PTE1_SIZE;
+		for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1));
+		    va < eva; va += PAGE_SIZE, m++)
+			if (pte1_is_dirty(opte1))
+				vm_page_dirty(m);
+	}
+	if ((prot & VM_PROT_WRITE) == 0)
+		npte1 |= PTE1_RO | PTE1_NM;
+	if ((prot & VM_PROT_EXECUTE) == 0)
+		npte1 |= PTE1_NX;
+
+	/*
+	 * QQQ: Herein, execute permission is never set.
+	 * It only can be cleared. So, no icache
+	 * syncing is needed.
+	 */
+
+	if (npte1 != opte1) {
+		/* Retry from the load if the entry changed underneath us. */
+		if (!pte1_cmpset(pte1p, opte1, npte1))
+			goto retry;
+		if (pte1_is_global(opte1))
+			tlb_flush(sva);
+		else
+			anychanged = TRUE;
+	}
+	return (anychanged);
+}
+
+/*
+ * Set the physical protection on the
+ * specified range of this map as requested.
+ */
+void
+pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
+{
+	boolean_t anychanged, pv_lists_locked;
+	vm_offset_t nextva;
+	pt1_entry_t *pte1p, pte1;
+	pt2_entry_t *pte2p, opte2, npte2;
+
+	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
+	if (prot == VM_PROT_NONE) {
+		pmap_remove(pmap, sva, eva);
+		return;
+	}
+
+	/* Write and execute both stay allowed: there is nothing to revoke. */
+	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
+	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
+		return;
+
+	/*
+	 * For the current pmap the pv list lock is taken lazily, only if a
+	 * section has to be demoted; "resume" is re-entered when that lazy
+	 * try-lock fails.
+	 */
+	if (pmap_is_current(pmap))
+		pv_lists_locked = FALSE;
+	else {
+		pv_lists_locked = TRUE;
+resume:
+		rw_wlock(&pvh_global_lock);
+		sched_pin();
+	}
+	anychanged = FALSE;
+
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = nextva) {
+		/*
+		 * Calculate address for next L2 page table.
+		 */
+		nextva = pte1_trunc(sva + PTE1_SIZE);
+		if (nextva < sva)
+			nextva = eva;
+
+		pte1p = pmap_pte1(pmap, sva);
+		pte1 = pte1_load(pte1p);
+
+		/*
+		 * Weed out invalid mappings. Note: we assume that L1 page
+		 * page table is always allocated, and in kernel virtual.
+		 */
+		if (pte1 == 0)
+			continue;
+
+		if (pte1_is_section(pte1)) {
+			/*
+			 * Are we protecting the entire large page? If not,
+			 * demote the mapping and fall through.
+			 */
+			if (sva + PTE1_SIZE == nextva && eva >= nextva) {
+				/*
+				 * The TLB entry for global mapping is
+				 * invalidated by pmap_protect_pte1().
+				 */
+				if (pmap_protect_pte1(pmap, pte1p, sva, prot))
+					anychanged = TRUE;
+				continue;
+			} else {
+				if (!pv_lists_locked) {
+					pv_lists_locked = TRUE;
+					if (!rw_try_wlock(&pvh_global_lock)) {
+						if (anychanged)
+							pmap_tlb_flush_ng(pmap);
+						PMAP_UNLOCK(pmap);
+						goto resume;
+					}
+					sched_pin();
+				}
+				if (!pmap_demote_pte1(pmap, pte1p, sva)) {
+					/*
+					 * The large page mapping
+					 * was destroyed.
+					 */
+					continue;
+				}
+#ifdef INVARIANTS
+				else {
+					/* Update pte1 after demotion */
+					pte1 = pte1_load(pte1p);
+				}
+#endif
+			}
+		}
+
+		KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p"
+		    " is not link", __func__, pmap, sva, pte1, pte1p));
+
+		/*
+		 * Limit our scan to either the end of the va represented
+		 * by the current L2 page table page, or to the end of the
+		 * range being protected.
+		 */
+		if (nextva > eva)
+			nextva = eva;
+
+		for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++,
+		    sva += PAGE_SIZE) {
+			vm_page_t m;
+retry:
+			opte2 = npte2 = pte2_load(pte2p);
+			if (!pte2_is_valid(opte2))
+				continue;
+
+			if ((prot & VM_PROT_WRITE) == 0) {
+				if (pte2_is_managed(opte2) &&
+				    pte2_is_dirty(opte2)) {
+					m = PHYS_TO_VM_PAGE(pte2_pa(opte2));
+					vm_page_dirty(m);
+				}
+				npte2 |= PTE2_RO | PTE2_NM;
+			}
+
+			if ((prot & VM_PROT_EXECUTE) == 0)
+				npte2 |= PTE2_NX;
+
+			/*
+			 * QQQ: Herein, execute permission is never set.
+			 * It only can be cleared. So, no icache
+			 * syncing is needed.
+			 */
+
+			if (npte2 != opte2) {
+
+				if (!pte2_cmpset(pte2p, opte2, npte2))
+					goto retry;
+
+				if (pte2_is_global(opte2))
+					tlb_flush(sva);
+				else
+					anychanged = TRUE;
+			}
+		}
+	}
+	if (anychanged)
+		pmap_tlb_flush_ng(pmap);
+	if (pv_lists_locked) {
+		sched_unpin();
+		rw_wunlock(&pvh_global_lock);
+	}
+	PMAP_UNLOCK(pmap);
+}
+
+/*
+ * pmap_pvh_wired_mappings:
+ *
+ *	Return the updated number "count" of managed mappings that are wired.
+ */
+static int
+pmap_pvh_wired_mappings(struct md_page *pvh, int count)
+{
+	pmap_t pmap;
+	pt1_entry_t pte1;
+	pt2_entry_t pte2;
+	pv_entry_t pv;
+
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	sched_pin();
+	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va));
+		if (pte1_is_section(pte1)) {
+			if (pte1_is_wired(pte1))
+				count++;
+		} else {
+			KASSERT(pte1_is_link(pte1),
+			    ("%s: pte1 %#x is not link", __func__, pte1));
+			pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va));
+			if (pte2_is_wired(pte2))
+				count++;
+		}
+		PMAP_UNLOCK(pmap);
+	}
+	sched_unpin();
+	return (count);
+}
+
+/*
+ * pmap_page_wired_mappings:
+ *
+ *	Return the number of managed mappings to the given physical page
+ *	that are wired.
+ */
+int
+pmap_page_wired_mappings(vm_page_t m)
+{
+	int count;
+
+	count = 0;
+	if ((m->oflags & VPO_UNMANAGED) != 0)
+		return (count);
+	rw_wlock(&pvh_global_lock);
+	/* Count 4KB mappings first, then 1MB mappings of real pages. */
+	count = pmap_pvh_wired_mappings(&m->md, count);
+	if ((m->flags & PG_FICTITIOUS) == 0) {
+		count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
+		    count);
+	}
+	rw_wunlock(&pvh_global_lock);
+	return (count);
+}
+
+/*
+ * Returns TRUE if any of the given mappings were used to modify
+ * physical memory. Otherwise, returns FALSE. Both page and 1mpage
+ * mappings are supported.
+ */
+static boolean_t
+pmap_is_modified_pvh(struct md_page *pvh)
+{
+	pv_entry_t pv;
+	pt1_entry_t pte1;
+	pmap_t pmap;
+	boolean_t dirty;
+
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	dirty = FALSE;
+	sched_pin();
+	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va));
+		if (!pte1_is_section(pte1)) {
+			/* 4KB mapping: the dirty state lives in the pte2. */
+			KASSERT(pte1_is_link(pte1),
+			    ("%s: pte1 %#x is not link", __func__, pte1));
+			dirty = pte2_is_dirty(pte2_load(pmap_pte2_quick(pmap,
+			    pv->pv_va)));
+		} else {
+			/* 1MB mapping: the dirty state lives in the pte1. */
+			dirty = pte1_is_dirty(pte1);
+		}
+		PMAP_UNLOCK(pmap);
+		/* The first modified mapping settles the answer. */
+		if (dirty)
+			break;
+	}
+	sched_unpin();
+	return (dirty);
+}
+
+/*
+ * pmap_is_modified:
+ *
+ *	Report whether the given managed physical page has been modified
+ *	through any of its 4KB or 1MB mappings.
+ */
+boolean_t
+pmap_is_modified(vm_page_t m)
+{
+	boolean_t modified;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("%s: page %p is not managed", __func__, m));
+
+	/*
+	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
+	 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE
+	 * is clear, no PTE2s can have PG_M set.
+	 */
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
+		return (FALSE);
+
+	rw_wlock(&pvh_global_lock);
+	if (pmap_is_modified_pvh(&m->md))
+		modified = TRUE;
+	else if ((m->flags & PG_FICTITIOUS) != 0)
+		modified = FALSE;
+	else if (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))))
+		modified = TRUE;
+	else
+		modified = FALSE;
+	rw_wunlock(&pvh_global_lock);
+	return (modified);
+}
+
+/*
+ * pmap_is_prefaultable:
+ *
+ *	Return whether or not the specified virtual address is eligible
+ *	for prefault.
+ */
+boolean_t
+pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
+{
+	boolean_t eligible;
+
+	eligible = FALSE;
+	PMAP_LOCK(pmap);
+	/*
+	 * Only an address that already has an L2 page table (a link entry
+	 * in L1) but no valid pte2 is eligible.
+	 */
+	if (pte1_is_link(pte1_load(pmap_pte1(pmap, addr))))
+		eligible = !pte2_is_valid(pte2_load(pt2map_entry(addr)));
+	PMAP_UNLOCK(pmap);
+	return (eligible);
+}
+
+/*
+ * Returns TRUE if any of the given mappings were referenced and FALSE
+ * otherwise. Both page and 1mpage mappings are supported.
+ */
+static boolean_t
+pmap_is_referenced_pvh(struct md_page *pvh)
+{
+	pv_entry_t pv;
+	pt1_entry_t pte1;
+	pt2_entry_t pte2;
+	pmap_t pmap;
+	boolean_t referenced;
+
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
+	referenced = FALSE;
+	sched_pin();
+	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va));
+		if (!pte1_is_section(pte1)) {
+			/* 4KB mapping: both PTE2_A and PTE2_V must be set. */
+			pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va));
+			referenced = (pte2 & PTE2_A) != 0 &&
+			    (pte2 & PTE2_V) != 0;
+		} else {
+			/* 1MB mapping: both PTE1_A and PTE1_V must be set. */
+			referenced = (pte1 & PTE1_A) != 0 &&
+			    (pte1 & PTE1_V) != 0;
+		}
+		PMAP_UNLOCK(pmap);
+		if (referenced)
+			break;
+	}
+	sched_unpin();
+	return (referenced);
+}
+
+/*
+ * pmap_is_referenced:
+ *
+ *	Report whether the given managed physical page has been referenced
+ *	through any of its 4KB or 1MB mappings.
+ */
+boolean_t
+pmap_is_referenced(vm_page_t m)
+{
+	boolean_t referenced;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("%s: page %p is not managed", __func__, m));
+	rw_wlock(&pvh_global_lock);
+	if (pmap_is_referenced_pvh(&m->md))
+		referenced = TRUE;
+	else if ((m->flags & PG_FICTITIOUS) != 0)
+		referenced = FALSE;
+	else if (pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))))
+		referenced = TRUE;
+	else
+		referenced = FALSE;
+	rw_wunlock(&pvh_global_lock);
+	return (referenced);
+}
+
+#define	PMAP_TS_REFERENCED_MAX	5
+
+/*
+ * pmap_ts_referenced:
+ *
+ *	Return a count of reference bits for a page, clearing those bits.
+ *	It is not necessary for every reference bit to be cleared, but it
+ *	is necessary that 0 only be returned when there are truly no
+ *	reference bits set.
+ *
+ *	XXX: The exact number of bits to check and clear is a matter that
+ *	should be tested and standardized at some point in the future for
+ *	optimal aging of shared pages.
+ */
+int
+pmap_ts_referenced(vm_page_t m)
+{
+	struct md_page *pvh;
+	pv_entry_t pv, pvf;
+	pmap_t pmap;
+	pt1_entry_t *pte1p, opte1;
+	pt2_entry_t *pte2p;
+	vm_paddr_t pa;
+	int rtval = 0;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("%s: page %p is not managed", __func__, m));
+	pa = VM_PAGE_TO_PHYS(m);
+	pvh = pa_to_pvh(pa);
+	rw_wlock(&pvh_global_lock);
+	sched_pin();
+	/* Fictitious pages and pages without 1MB mappings skip this pass. */
+	if ((m->flags & PG_FICTITIOUS) != 0 ||
+	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
+		goto small_mappings;
+	pv = pvf;
+	do {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1p = pmap_pte1(pmap, pv->pv_va);
+		opte1 = pte1_load(pte1p);
+		if ((opte1 & PTE1_A) != 0) {
+			/*
+			 * Since this reference bit is shared by 256 4KB pages,
+			 * it should not be cleared every time it is tested.
+			 * Apply a simple "hash" function on the physical page
+			 * number, the virtual section number, and the pmap
+			 * address to select one 4KB page out of the 256
+			 * on which testing the reference bit will result
+			 * in clearing that bit. This function is designed
+			 * to avoid the selection of the same 4KB page
+			 * for every 1MB page mapping.
+			 *
+			 * On demotion, a mapping that hasn't been referenced
+			 * is simply destroyed. To avoid the possibility of a
+			 * subsequent page fault on a demoted wired mapping,
+			 * always leave its reference bit set. Moreover,
+			 * since the section is wired, the current state of
+			 * its reference bit won't affect page replacement.
+			 */
+			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^
+			    (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 &&
+			    !pte1_is_wired(opte1)) {
+				pte1_clear_bit(pte1p, PTE1_A);
+				pmap_tlb_flush(pmap, pv->pv_va);
+			}
+			rtval++;
+		}
+		PMAP_UNLOCK(pmap);
+		/* Rotate the PV list if it has more than one entry. */
+		if (TAILQ_NEXT(pv, pv_next) != NULL) {
+			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
+			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+		}
+		if (rtval >= PMAP_TS_REFERENCED_MAX)
+			goto out;
+		/* Stop once the rotation brings the first entry back to the head. */
+	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
+small_mappings:
+	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
+		goto out;
+	pv = pvf;
+	do {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1p = pmap_pte1(pmap, pv->pv_va);
+		KASSERT(pte1_is_link(pte1_load(pte1p)),
+		    ("%s: not found a link in page %p's pv list", __func__, m));
+
+		pte2p = pmap_pte2_quick(pmap, pv->pv_va);
+		if ((pte2_load(pte2p) & PTE2_A) != 0) {
+			pte2_clear_bit(pte2p, PTE2_A);
+			pmap_tlb_flush(pmap, pv->pv_va);
+			rtval++;
+		}
+		PMAP_UNLOCK(pmap);
+		/* Rotate the PV list if it has more than one entry. */
+		if (TAILQ_NEXT(pv, pv_next) != NULL) {
+			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
+			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+		}
+	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
+	    PMAP_TS_REFERENCED_MAX);
+out:
+	sched_unpin();
+	rw_wunlock(&pvh_global_lock);
+	return (rtval);
+}
+
+/*
+ * Clear the wired attribute from the mappings for the specified range of
+ * addresses in the given pmap. Every valid mapping within that range
+ * must have the wired attribute set. In contrast, invalid mappings
+ * cannot have the wired attribute set, so they are ignored.
+ *
+ * The wired attribute of the page table entry is not a hardware feature,
+ * so there is no need to invalidate any TLB entries.
+ */
+void
+pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	vm_offset_t nextva;
+	pt1_entry_t *pte1p, pte1;
+	pt2_entry_t *pte2p, pte2;
+	boolean_t pv_lists_locked;
+
+	/*
+	 * For the current pmap the pv list lock is only taken if a section
+	 * must be demoted; "resume" is re-entered when that try-lock fails.
+	 */
+	if (pmap_is_current(pmap))
+		pv_lists_locked = FALSE;
+	else {
+		pv_lists_locked = TRUE;
+resume:
+		rw_wlock(&pvh_global_lock);
+		sched_pin();
+	}
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = nextva) {
+		nextva = pte1_trunc(sva + PTE1_SIZE);
+		if (nextva < sva)
+			nextva = eva;
+
+		pte1p = pmap_pte1(pmap, sva);
+		pte1 = pte1_load(pte1p);
+
+		/*
+		 * Weed out invalid mappings. Note: we assume that L1 page
+		 * page table is always allocated, and in kernel virtual.
+		 */
+		if (pte1 == 0)
+			continue;
+
+		if (pte1_is_section(pte1)) {
+			if (!pte1_is_wired(pte1))
+				panic("%s: pte1 %#x not wired", __func__, pte1);
+
+			/*
+			 * Are we unwiring the entire large page? If not,
+			 * demote the mapping and fall through.
+			 */
+			if (sva + PTE1_SIZE == nextva && eva >= nextva) {
+				pte1_clear_bit(pte1p, PTE1_W);
+				pmap->pm_stats.wired_count -= PTE1_SIZE /
+				    PAGE_SIZE;
+				continue;
+			} else {
+				if (!pv_lists_locked) {
+					pv_lists_locked = TRUE;
+					if (!rw_try_wlock(&pvh_global_lock)) {
+						PMAP_UNLOCK(pmap);
+						/* Repeat sva. */
+						goto resume;
+					}
+					sched_pin();
+				}
+				if (!pmap_demote_pte1(pmap, pte1p, sva))
+					panic("%s: demotion failed", __func__);
+#ifdef INVARIANTS
+				else {
+					/* Update pte1 after demotion */
+					pte1 = pte1_load(pte1p);
+				}
+#endif
+			}
+		}
+
+		KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p"
+		    " is not link", __func__, pmap, sva, pte1, pte1p));
+
+		/*
+		 * Limit our scan to either the end of the va represented
+		 * by the current L2 page table page, or to the end of the
+		 * range being protected.
+		 */
+		if (nextva > eva)
+			nextva = eva;
+
+		for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++,
+		    sva += PAGE_SIZE) {
+			pte2 = pte2_load(pte2p);
+			if (!pte2_is_valid(pte2))
+				continue;
+			if (!pte2_is_wired(pte2))
+				panic("%s: pte2 %#x is missing PTE2_W",
+				    __func__, pte2);
+
+			/*
+			 * PTE2_W must be cleared atomically. Although the pmap
+			 * lock synchronizes access to PTE2_W, another processor
+			 * could be changing PTE2_NM and/or PTE2_A concurrently.
+			 */
+			pte2_clear_bit(pte2p, PTE2_W);
+			pmap->pm_stats.wired_count--;
+		}
+	}
+	if (pv_lists_locked) {
+		sched_unpin();
+		rw_wunlock(&pvh_global_lock);
+	}
+	PMAP_UNLOCK(pmap);
+}
+
+/*
+ * Clear the write and modified bits in each of the given page's mappings.
+ */
+void
+pmap_remove_write(vm_page_t m)
+{
+	struct md_page *pvh;
+	pv_entry_t next_pv, pv;
+	pmap_t pmap;
+	pt1_entry_t *pte1p;
+	pt2_entry_t *pte2p, opte2;
+	vm_offset_t va;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("%s: page %p is not managed", __func__, m));
+
+	/*
+	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
+	 * set by another thread while the object is locked. Thus,
+	 * if PGA_WRITEABLE is clear, no page table entries need updating.
+	 */
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
+		return;
+	rw_wlock(&pvh_global_lock);
+	sched_pin();
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		goto small_mappings;
+	/*
+	 * Writable 1MB mappings are demoted here; the loop below then write
+	 * protects the mappings found on the page's 4KB pv list.
+	 */
+	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
+		va = pv->pv_va;
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1p = pmap_pte1(pmap, va);
+		if (!(pte1_load(pte1p) & PTE1_RO))
+			(void)pmap_demote_pte1(pmap, pte1p, va);
+		PMAP_UNLOCK(pmap);
+	}
+small_mappings:
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1p = pmap_pte1(pmap, pv->pv_va);
+		KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found"
+		    " a section in page %p's pv list", __func__, m));
+		pte2p = pmap_pte2_quick(pmap, pv->pv_va);
+retry:
+		opte2 = pte2_load(pte2p);
+		if (!(opte2 & PTE2_RO)) {
+			/* Retry from the load if the entry changed meanwhile. */
+			if (!pte2_cmpset(pte2p, opte2,
+			    opte2 | (PTE2_RO | PTE2_NM)))
+				goto retry;
+			if (pte2_is_dirty(opte2))
+				vm_page_dirty(m);
+			pmap_tlb_flush(pmap, pv->pv_va);
+		}
+		PMAP_UNLOCK(pmap);
+	}
+	vm_page_aflag_clear(m, PGA_WRITEABLE);
+	sched_unpin();
+	rw_wunlock(&pvh_global_lock);
+}
+
+/*
+ * Apply the given advice to the specified range of addresses within the
+ * given pmap. Depending on the advice, clear the referenced and/or
+ * modified flags in each mapping and set the mapped page's dirty field.
+ */
+void
+pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
+{
+	pt1_entry_t *pte1p, opte1;
+	pt2_entry_t *pte2p, pte2;
+	vm_offset_t pdnxt;
+	vm_page_t m;
+	boolean_t anychanged, pv_lists_locked;
+
+	/* Only MADV_DONTNEED and MADV_FREE are acted upon. */
+	if (advice != MADV_DONTNEED && advice != MADV_FREE)
+		return;
+	/*
+	 * For the current pmap the pv list lock is only taken if a section
+	 * must be demoted; "resume" is re-entered when that try-lock fails.
+	 */
+	if (pmap_is_current(pmap))
+		pv_lists_locked = FALSE;
+	else {
+		pv_lists_locked = TRUE;
+resume:
+		rw_wlock(&pvh_global_lock);
+		sched_pin();
+	}
+	anychanged = FALSE;
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = pdnxt) {
+		pdnxt = pte1_trunc(sva + PTE1_SIZE);
+		if (pdnxt < sva)
+			pdnxt = eva;
+		pte1p = pmap_pte1(pmap, sva);
+		opte1 = pte1_load(pte1p);
+		if (!pte1_is_valid(opte1)) /* XXX */
+			continue;
+		else if (pte1_is_section(opte1)) {
+			if (!pte1_is_managed(opte1))
+				continue;
+			if (!pv_lists_locked) {
+				pv_lists_locked = TRUE;
+				if (!rw_try_wlock(&pvh_global_lock)) {
+					if (anychanged)
+						pmap_tlb_flush_ng(pmap);
+					PMAP_UNLOCK(pmap);
+					goto resume;
+				}
+				sched_pin();
+			}
+			if (!pmap_demote_pte1(pmap, pte1p, sva)) {
+				/*
+				 * The large page mapping was destroyed.
+				 */
+				continue;
+			}
+
+			/*
+			 * Unless the page mappings are wired, remove the
+			 * mapping to a single page so that a subsequent
+			 * access may repromote. Since the underlying L2 page
+			 * table is fully populated, this removal never
+			 * frees a L2 page table page.
+			 */
+			if (!pte1_is_wired(opte1)) {
+				pte2p = pmap_pte2_quick(pmap, sva);
+				KASSERT(pte2_is_valid(pte2_load(pte2p)),
+				    ("%s: invalid PTE2", __func__));
+				pmap_remove_pte2(pmap, pte2p, sva, NULL);
+				anychanged = TRUE;
+			}
+		}
+		if (pdnxt > eva)
+			pdnxt = eva;
+		for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++,
+		    sva += PAGE_SIZE) {
+			pte2 = pte2_load(pte2p);
+			if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2))
+				continue;
+			else if (pte2_is_dirty(pte2)) {
+				if (advice == MADV_DONTNEED) {
+					/*
+					 * Future calls to pmap_is_modified()
+					 * can be avoided by making the page
+					 * dirty now.
+					 */
+					m = PHYS_TO_VM_PAGE(pte2_pa(pte2));
+					vm_page_dirty(m);
+				}
+				pte2_set_bit(pte2p, PTE2_NM);
+				pte2_clear_bit(pte2p, PTE2_A);
+			} else if ((pte2 & PTE2_A) != 0)
+				pte2_clear_bit(pte2p, PTE2_A);
+			else
+				continue;
+			if (pte2_is_global(pte2))
+				tlb_flush(sva);
+			else
+				anychanged = TRUE;
+		}
+	}
+	if (anychanged)
+		pmap_tlb_flush_ng(pmap);
+	if (pv_lists_locked) {
+		sched_unpin();
+		rw_wunlock(&pvh_global_lock);
+	}
+	PMAP_UNLOCK(pmap);
+}
+
+/*
+ * Clear the modify bits on the specified physical page.
+ */
+void
+pmap_clear_modify(vm_page_t m)
+{
+	struct md_page *pvh;
+	pv_entry_t next_pv, pv;
+	pmap_t pmap;
+	pt1_entry_t *pte1p, opte1;
+	pt2_entry_t *pte2p, opte2;
+	vm_offset_t va;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("%s: page %p is not managed", __func__, m));
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	KASSERT(!vm_page_xbusied(m),
+	    ("%s: page %p is exclusive busy", __func__, m));
+
+	/*
+	 * If the page is not PGA_WRITEABLE, then no PTE2s can have PTE2_NM
+	 * cleared. If the object containing the page is locked and the page
+	 * is not exclusive busied, then PGA_WRITEABLE cannot be concurrently
+	 * set.
+	 *
+	 * PGA_WRITEABLE is tested in m->aflags, the same field that
+	 * pmap_is_modified() and pmap_remove_write() test it in.
+	 */
+	if ((m->aflags & PGA_WRITEABLE) == 0)
+		return;
+	rw_wlock(&pvh_global_lock);
+	sched_pin();
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		goto small_mappings;
+	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
+		va = pv->pv_va;
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1p = pmap_pte1(pmap, va);
+		opte1 = pte1_load(pte1p);
+		if (!(opte1 & PTE1_RO)) {
+			if (pmap_demote_pte1(pmap, pte1p, va) &&
+			    !pte1_is_wired(opte1)) {
+				/*
+				 * Write protect the mapping to a
+				 * single page so that a subsequent
+				 * write access may repromote.
+				 */
+				va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1);
+				pte2p = pmap_pte2_quick(pmap, va);
+				opte2 = pte2_load(pte2p);
+				if ((opte2 & PTE2_V)) {
+					pte2_set_bit(pte2p, PTE2_NM | PTE2_RO);
+					vm_page_dirty(m);
+					pmap_tlb_flush(pmap, va);
+				}
+			}
+		}
+		PMAP_UNLOCK(pmap);
+	}
+small_mappings:
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte1p = pmap_pte1(pmap, pv->pv_va);
+		KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found"
+		    " a section in page %p's pv list", __func__, m));
+		pte2p = pmap_pte2_quick(pmap, pv->pv_va);
+		if (pte2_is_dirty(pte2_load(pte2p))) {
+			pte2_set_bit(pte2p, PTE2_NM);
+			pmap_tlb_flush(pmap, pv->pv_va);
+		}
+		PMAP_UNLOCK(pmap);
+	}
+	sched_unpin();
+	rw_wunlock(&pvh_global_lock);
+}
+
+
+/*
+ * Sets the memory attribute for the specified page.
+ */
+void
+pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
+{
+	struct sysmaps *sysmaps;
+	vm_memattr_t oma;
+	vm_paddr_t pa;
+	vm_offset_t va;
+
+	oma = m->md.pat_mode;
+	m->md.pat_mode = ma;
+
+	/* CTR5 supplies five arguments, so the format has five conversions. */
+	CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m,
+	    VM_PAGE_TO_PHYS(m), oma, ma);
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		return;
+#if 0
+	/*
+	 * If "m" is a normal page, flush it from the cache.
+	 *
+	 * First, try to find an existing mapping of the page by sf
+	 * buffer. sf_buf_invalidate_cache() modifies mapping and
+	 * flushes the cache.
+	 */
+	if (sf_buf_invalidate_cache(m, oma))
+		return;
+#endif
+	/*
+	 * If page is not mapped by sf buffer, map the page
+	 * transient and do invalidation.
+	 */
+	if (ma != oma) {
+		pa = VM_PAGE_TO_PHYS(m);
+		sched_pin();
+		sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
+		mtx_lock(&sysmaps->lock);
+		if (*sysmaps->CMAP2)
+			panic("%s: CMAP2 busy", __func__);
+		pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(pa, PTE2_AP_KRW, ma));
+		va = (vm_offset_t)sysmaps->CADDR2;
+		tlb_flush_local(va);
+		dcache_wbinv_poc(va, pa, PAGE_SIZE);
+		pte2_clear(sysmaps->CMAP2);
+		sched_unpin();
+		mtx_unlock(&sysmaps->lock);
+	}
+}
+
+/*
+ * Miscellaneous support routines follow
+ */
+
+/*
+ * Returns TRUE if the given page is mapped individually or as part of
+ * a 1mpage. Otherwise, returns FALSE.
+ */
+boolean_t
+pmap_page_is_mapped(vm_page_t m)
+{
+	boolean_t rv;
+
+	/* Unmanaged pages are always reported as not mapped. */
+	if ((m->oflags & VPO_UNMANAGED) != 0)
+		return (FALSE);
+	rw_wlock(&pvh_global_lock);
+	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
+	    ((m->flags & PG_FICTITIOUS) == 0 &&
+	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
+	rw_wunlock(&pvh_global_lock);
+	return (rv);
+}
+
+/*
+ * Returns true if the pmap's pv is one of the first
+ * 16 pvs linked to from this page. This count may
+ * be changed upwards or downwards in the future; it
+ * is only necessary that true be returned for a small
+ * subset of pmaps for proper page aging.
+ */
+boolean_t
+pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
+{
+	struct md_page *pvh;
+	pv_entry_t pv;
+	/* Number of pv entries examined so far, counted across both lists. */
+	int loops = 0;
+	boolean_t rv;
+
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("%s: page %p is not managed", __func__, m));
+	rv = FALSE;
+	rw_wlock(&pvh_global_lock);
+	/* First pass: the pv list that hangs off the page itself. */
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+		if (PV_PMAP(pv) == pmap) {
+			rv = TRUE;
+			break;
+		}
+		loops++;
+		if (loops >= 16)
+			break;
+	}
+	/*
+	 * Second pass, only when nothing was found, the budget of 16 entries
+	 * is not used up and the page is not fictitious: the pv list found
+	 * through pa_to_pvh() for the page's physical address.
+	 */
+	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
+		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
+			if (PV_PMAP(pv) == pmap) {
+				rv = TRUE;
+				break;
+			}
+			loops++;
+			if (loops >= 16)
+				break;
+		}
+	}
+	rw_wunlock(&pvh_global_lock);
+	return (rv);
+}
+
+/*
+ *	pmap_zero_page zeros the specified hardware page by mapping
+ *	the page into KVM and using pagezero() to clear its contents.
+ *
+ *	The page is mapped through this CPU's CMAP2/CADDR2 slot using the
+ *	page's own memory attribute.  Panics if the slot is already in use.
+ */
+void
+pmap_zero_page(vm_page_t m)
+{
+	struct sysmaps *sysmaps;
+
+	/* The per-CPU slot is chosen and used with the thread pinned. */
+	sched_pin();
+	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
+	mtx_lock(&sysmaps->lock);
+	if (pte2_load(sysmaps->CMAP2) != 0)
+		panic("%s: CMAP2 busy", __func__);
+	pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW,
+	    m->md.pat_mode));
+	/* Flush the local TLB entry for the window before touching it. */
+	tlb_flush_local((vm_offset_t)sysmaps->CADDR2);
+	pagezero(sysmaps->CADDR2);
+	/* Tear the temporary mapping down before releasing the slot. */
+	pte2_clear(sysmaps->CMAP2);
+	sched_unpin();
+	mtx_unlock(&sysmaps->lock);
+}
+
+/*
+ *	pmap_zero_page_area zeros the specified hardware page by mapping
+ *	the page into KVM and using bzero to clear its contents.
+ *
+ *	off and size may not cover an area beyond a single hardware page.
+ */
+void
+pmap_zero_page_area(vm_page_t m, int off, int size)
+{
+	struct sysmaps *sysmaps;
+
+	/* The per-CPU slot is chosen and used with the thread pinned. */
+	sched_pin();
+	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
+	mtx_lock(&sysmaps->lock);
+	if (pte2_load(sysmaps->CMAP2) != 0)
+		panic("%s: CMAP2 busy", __func__);
+	pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW,
+	    m->md.pat_mode));
+	tlb_flush_local((vm_offset_t)sysmaps->CADDR2);
+	/* A request for the whole page goes through pagezero(). */
+	if (off == 0 && size == PAGE_SIZE)
+		pagezero(sysmaps->CADDR2);
+	else
+		bzero(sysmaps->CADDR2 + off, size);
+	/* Tear the temporary mapping down before releasing the slot. */
+	pte2_clear(sysmaps->CMAP2);
+	sched_unpin();
+	mtx_unlock(&sysmaps->lock);
+}
+
+/*
+ *	pmap_zero_page_idle zeros the specified hardware page by mapping
+ *	the page into KVM and using pagezero() to clear its contents.  This
+ *	is intended to be called from the vm_pagezero process only and
+ *	outside of Giant.
+ *
+ *	Unlike pmap_zero_page(), the CMAP3/CADDR3 slot is referenced directly
+ *	rather than through the per-CPU sysmaps, and no sysmaps lock is taken.
+ *	Panics if CMAP3 is already in use.
+ */
+void
+pmap_zero_page_idle(vm_page_t m)
+{
+
+	/* The busy check is made before the thread is pinned. */
+	if (pte2_load(CMAP3) != 0)
+		panic("%s: CMAP3 busy", __func__);
+	sched_pin();
+	pte2_store(CMAP3, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW,
+	    m->md.pat_mode));
+	tlb_flush_local((vm_offset_t)CADDR3);
+	pagezero(CADDR3);
+	pte2_clear(CMAP3);
+	sched_unpin();
+}
+
+/*
+ *	pmap_copy_page copies the specified (machine independent)
+ *	page by mapping the page into virtual memory and using
+ *	bcopy to copy the page, one machine dependent page at a
+ *	time.
+ */
+void
+pmap_copy_page(vm_page_t src, vm_page_t dst)
+{
+	struct sysmaps *sysmaps;
+
+	/* The per-CPU slots are chosen and used with the thread pinned. */
+	sched_pin();
+	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
+	mtx_lock(&sysmaps->lock);
+	if (pte2_load(sysmaps->CMAP1) != 0)
+		panic("%s: CMAP1 busy", __func__);
+	if (pte2_load(sysmaps->CMAP2) != 0)
+		panic("%s: CMAP2 busy", __func__);
+	/* Source window: kernel read access with PTE2_NM set. */
+	pte2_store(sysmaps->CMAP1, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src),
+	    PTE2_AP_KR | PTE2_NM, src->md.pat_mode));
+	tlb_flush_local((vm_offset_t)sysmaps->CADDR1);
+	/* Destination window: kernel read/write access. */
+	pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst),
+	    PTE2_AP_KRW, dst->md.pat_mode));
+	tlb_flush_local((vm_offset_t)sysmaps->CADDR2);
+	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
+	/* Tear both temporary mappings down before releasing the slots. */
+	pte2_clear(sysmaps->CMAP1);
+	pte2_clear(sysmaps->CMAP2);
+	sched_unpin();
+	mtx_unlock(&sysmaps->lock);
+}
+
+int unmapped_buf_allowed = 1;
+
+/*
+ * Copy xfersize bytes from the page array "ma", starting at byte offset
+ * a_offset, to the page array "mb", starting at byte offset b_offset.
+ *
+ * The copy proceeds in chunks that cross a page boundary in neither the
+ * source nor the destination.  Each chunk is copied through this CPU's
+ * CMAP1/CADDR1 (source) and CMAP2/CADDR2 (destination) windows.
+ * Panics if either window is already in use.
+ */
+void
+pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
+    vm_offset_t b_offset, int xfersize)
+{
+	struct sysmaps *sysmaps;
+	vm_page_t a_pg, b_pg;
+	char *a_cp, *b_cp;
+	vm_offset_t a_pg_offset, b_pg_offset;
+	int cnt;
+
+	sched_pin();
+	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
+	mtx_lock(&sysmaps->lock);
+	if (*sysmaps->CMAP1 != 0)
+		panic("pmap_copy_pages: CMAP1 busy");
+	if (*sysmaps->CMAP2 != 0)
+		panic("pmap_copy_pages: CMAP2 busy");
+	while (xfersize > 0) {
+		/* Page and in-page offset of the current source position. */
+		a_pg = ma[a_offset >> PAGE_SHIFT];
+		a_pg_offset = a_offset & PAGE_MASK;
+		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
+		/* Same for the destination; cnt is clipped to fit both pages. */
+		b_pg = mb[b_offset >> PAGE_SHIFT];
+		b_pg_offset = b_offset & PAGE_MASK;
+		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
+		pte2_store(sysmaps->CMAP1, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg),
+		    PTE2_AP_KR | PTE2_NM, a_pg->md.pat_mode));
+		tlb_flush_local((vm_offset_t)sysmaps->CADDR1);
+		pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg),
+		    PTE2_AP_KRW, b_pg->md.pat_mode));
+		tlb_flush_local((vm_offset_t)sysmaps->CADDR2);
+		a_cp = sysmaps->CADDR1 + a_pg_offset;
+		b_cp = sysmaps->CADDR2 + b_pg_offset;
+		bcopy(a_cp, b_cp, cnt);
+		a_offset += cnt;
+		b_offset += cnt;
+		xfersize -= cnt;
+	}
+	/* The windows are remapped on every pass and cleared only once here. */
+	pte2_clear(sysmaps->CMAP1);
+	pte2_clear(sysmaps->CMAP2);
+	sched_unpin();
+	mtx_unlock(&sysmaps->lock);
+}
+
+/*
+ *	Copy the range specified by src_addr/len
+ *	from the source map to the range dst_addr/len
+ *	in the destination map.
+ *
+ *	This routine is only advisory and need not do anything.
+ *
+ *	Nothing is copied unless dst_addr equals src_addr and src_pmap is
+ *	the current pmap.  The copy stops early, without reporting an error,
+ *	when a destination page table page or pv entry cannot be obtained.
+ */
+void
+pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
+    vm_offset_t src_addr)
+{
+	struct spglist free;
+	vm_offset_t addr;
+	vm_offset_t end_addr = src_addr + len;
+	vm_offset_t nextva;
+
+	if (dst_addr != src_addr)
+		return;
+
+	if (!pmap_is_current(src_pmap))
+		return;
+
+	rw_wlock(&pvh_global_lock);
+	/*
+	 * The two pmap locks are taken in ascending pointer order
+	 * (presumably to keep the lock order consistent between callers).
+	 */
+	if (dst_pmap < src_pmap) {
+		PMAP_LOCK(dst_pmap);
+		PMAP_LOCK(src_pmap);
+	} else {
+		PMAP_LOCK(src_pmap);
+		PMAP_LOCK(dst_pmap);
+	}
+	sched_pin();
+	/* Walk the range one L1 entry (PTE1_SIZE of address space) at a time. */
+	for (addr = src_addr; addr < end_addr; addr = nextva) {
+		pt2_entry_t *src_pte2p, *dst_pte2p;
+		vm_page_t dst_mpt2pg, src_mpt2pg;
+		pt1_entry_t src_pte1;
+		u_int pte1_idx;
+
+		KASSERT(addr < VM_MAXUSER_ADDRESS,
+		    ("%s: invalid to pmap_copy page tables", __func__));
+
+		/* Start of the next L1 entry; on wrap-around finish the walk. */
+		nextva = pte1_trunc(addr + PTE1_SIZE);
+		if (nextva < addr)
+			nextva = end_addr;
+
+		pte1_idx = pte1_index(addr);
+		src_pte1 = src_pmap->pm_pt1[pte1_idx];
+		if (pte1_is_section(src_pte1)) {
+			/*
+			 * A section is copied only when it lies entirely
+			 * inside the range, the destination L1 entry is
+			 * empty and, for a managed section, a pv entry
+			 * could be inserted.  PTE1_W is cleared in the copy.
+			 */
+			if ((addr & PTE1_OFFSET) != 0 ||
+			    (addr + PTE1_SIZE) > end_addr)
+				continue;
+			if (dst_pmap->pm_pt1[pte1_idx] == 0 &&
+			    (!pte1_is_managed(src_pte1) ||
+			    pmap_pv_insert_pte1(dst_pmap, addr,
+			    pte1_pa(src_pte1)))) {
+				dst_pmap->pm_pt1[pte1_idx] = src_pte1 &
+				    ~PTE1_W;
+				dst_pmap->pm_stats.resident_count +=
+				    PTE1_SIZE / PAGE_SIZE;
+			}
+			continue;
+		} else if (!pte1_is_link(src_pte1))
+			continue;
+
+		src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1));
+
+		/*
+		 * We leave PT2s to be linked from PT1 even if they are not
+		 * referenced until all PT2s in a page are without reference.
+		 *
+		 * QQQ: It could be changed ...
+		 */
+#if 0 /* single_pt2_link_is_cleared */
+		KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0,
+		    ("%s: source page table page is unused", __func__));
+#else
+		if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0)
+			continue;
+#endif
+		if (nextva > end_addr)
+			nextva = end_addr;
+
+		src_pte2p = pt2map_entry(addr);
+		while (addr < nextva) {
+			pt2_entry_t temp_pte2;
+			temp_pte2 = pte2_load(src_pte2p);
+			/*
+			 * we only virtual copy managed pages
+			 */
+			if (pte2_is_managed(temp_pte2)) {
+				/* PMAP_ENTER_NOSLEEP: give up if allocation fails. */
+				dst_mpt2pg = pmap_allocpte2(dst_pmap, addr,
+				    PMAP_ENTER_NOSLEEP);
+				if (dst_mpt2pg == NULL)
+					goto out;
+				dst_pte2p = pmap_pte2_quick(dst_pmap, addr);
+				if (!pte2_is_valid(pte2_load(dst_pte2p)) &&
+				    pmap_try_insert_pv_entry(dst_pmap, addr,
+				    PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) {
+					/*
+					 * Clear the wired, modified, and
+					 * accessed (referenced) bits
+					 * during the copy.
+					 */
+					temp_pte2 &= ~(PTE2_W | PTE2_A);
+					temp_pte2 |= PTE2_NM;
+					pte2_store(dst_pte2p, temp_pte2);
+					dst_pmap->pm_stats.resident_count++;
+				} else {
+					/*
+					 * Nothing was entered: undo the wiring
+					 * done by pmap_allocpte2() and stop.
+					 */
+					SLIST_INIT(&free);
+					if (pmap_unwire_pt2(dst_pmap, addr,
+					    dst_mpt2pg, &free)) {
+						pmap_tlb_flush(dst_pmap, addr);
+						pmap_free_zero_pages(&free);
+					}
+					goto out;
+				}
+				/*
+				 * Leave this L2 table once the destination
+				 * wire count has reached the source's.
+				 */
+				if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >=
+				    pt2_wirecount_get(src_mpt2pg, pte1_idx))
+					break;
+			}
+			addr += PAGE_SIZE;
+			src_pte2p++;
+		}
+	}
+out:
+	sched_unpin();
+	rw_wunlock(&pvh_global_lock);
+	PMAP_UNLOCK(src_pmap);
+	PMAP_UNLOCK(dst_pmap);
+}
+
+/*
+ *	Increase the starting virtual address of the given mapping if a
+ *	different alignment might result in more section mappings.
+ */
+void
+pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
+    vm_offset_t *addr, vm_size_t size)
+{
+	vm_offset_t pte1_offset;
+
+	/* A mapping smaller than one section cannot contain a section. */
+	if (size < PTE1_SIZE)
+		return;
+	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
+		offset += ptoa(object->pg_color);
+	/* Offset of the backing data within a section-sized unit. */
+	pte1_offset = offset & PTE1_OFFSET;
+	/*
+	 * Leave *addr alone when less than one full section remains past
+	 * the first section boundary of the object offset, or when *addr
+	 * already has the same offset within a section.
+	 */
+	if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE ||
+	    (*addr & PTE1_OFFSET) == pte1_offset)
+		return;
+	/* Move *addr up (never down) to the next address with that offset. */
+	if ((*addr & PTE1_OFFSET) < pte1_offset)
+		*addr = pte1_trunc(*addr) + pte1_offset;
+	else
+		*addr = pte1_roundup(*addr) + pte1_offset;
+}
+
+/*
+ * Make the pmap of td's process the current pmap on this CPU.
+ *
+ * Inside a critical section: moves this CPU's bit from the pm_active set
+ * of the previous curpmap to that of the new pmap, stores the new TTB
+ * value in td's PCB, loads it with cp15_ttbr_set() and updates the
+ * per-CPU curpmap.
+ */
+void
+pmap_activate(struct thread *td)
+{
+	pmap_t pmap, oldpmap;
+	u_int cpuid, ttb;
+
+	PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td));
+
+	critical_enter();
+	pmap = vmspace_pmap(td->td_proc->p_vmspace);
+	oldpmap = PCPU_GET(curpmap);
+	cpuid = PCPU_GET(cpuid);
+
+	/* SMP kernels use the atomic variants of the set operations. */
+#if defined(SMP)
+	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
+	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
+#else
+	CPU_CLR(cpuid, &oldpmap->pm_active);
+	CPU_SET(cpuid, &pmap->pm_active);
+#endif
+
+	ttb = pmap_ttb_get(pmap);
+
+	/*
+	 * pmap_activate is for the current thread on the current cpu
+	 */
+	td->td_pcb->pcb_pagedir = ttb;
+	cp15_ttbr_set(ttb);
+	PCPU_SET(curpmap, pmap);
+	critical_exit();
+}
+
+/*
+ * Thin wrapper: returns the result of pmap_is_current(pmap).
+ */
+int
+pmap_dmap_iscurrent(pmap_t pmap)
+{
+
+	return (pmap_is_current(pmap));
+}
+
+/*
+ * Perform the pmap work for mincore.
+ */
+int
+pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
+{
+	pt1_entry_t *pte1p, pte1;
+	pt2_entry_t *pte2p, pte2;
+	vm_paddr_t pa;
+	boolean_t managed;
+	int val;
+
+	PMAP_LOCK(pmap);
+retry:
+	pte1p = pmap_pte1(pmap, addr);
+	pte1 = pte1_load(pte1p);
+	if (pte1_is_section(pte1)) {
+		/* Section mapping: report the page inside the section. */
+		pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET));
+		managed = pte1_is_managed(pte1);
+		val = MINCORE_SUPER | MINCORE_INCORE;
+		if (pte1_is_dirty(pte1))
+			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
+		if (pte1 & PTE1_A)
+			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
+	} else if (pte1_is_link(pte1)) {
+		/*
+		 * Small mapping: take a snapshot of the L2 entry.
+		 * NOTE(review): MINCORE_INCORE is set here without testing
+		 * the validity of pte2 - confirm that this is intended.
+		 */
+		pte2p = pmap_pte2(pmap, addr);
+		pte2 = pte2_load(pte2p);
+		pmap_pte2_release(pte2p);
+		pa = pte2_pa(pte2);
+		managed = pte2_is_managed(pte2);
+		val = MINCORE_INCORE;
+		if (pte2_is_dirty(pte2))
+			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
+		if (pte2 & PTE2_A)
+			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
+	} else {
+		/* Nothing mapped; "pa" stays unset and is not used below. */
+		managed = FALSE;
+		val = 0;
+	}
+	/*
+	 * The page lock is wanted only for a managed page for which not both
+	 * of the *_OTHER bits are already set.  vm_page_pa_tryrelock()
+	 * returning non-zero restarts the lookup from "retry".
+	 */
+	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
+	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
+		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
+		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
+			goto retry;
+	} else
+		PA_UNLOCK_COND(*locked_pa);
+	PMAP_UNLOCK(pmap);
+	return (val);
+}
+
+/*
+ * Enter a kernel mapping of "pa" at "va" with kernel read/write access
+ * and the device memory attribute, then flush the TLB for "va".
+ */
+void
+pmap_kenter_device(vm_offset_t va, vm_paddr_t pa)
+{
+
+	pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEVICE);
+	tlb_flush(va);
+}
+
+/*
+ * Store the TTB value of "pmap" into pcb->pcb_pagedir.
+ */
+void
+pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb)
+{
+
+	pcb->pcb_pagedir = pmap_ttb_get(pmap);
+}
+
+
+/*
+ *  Clean L1 data cache range on a single page, which is not mapped yet.
+ */ +static void +pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) +{ + struct sysmaps *sysmaps; + vm_offset_t va; + + KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, + ("%s: not on single page", __func__)); + + sched_pin(); + sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; + mtx_lock(&sysmaps->lock); + if (*sysmaps->CMAP3) + panic("%s: CMAP3 busy", __func__); + pte2_store(sysmaps->CMAP3, PTE2_KERN_NG(pa, PTE2_AP_KRW, ma)); + va = (vm_offset_t)sysmaps->CADDR3; + tlb_flush_local(va); + dcache_wb_pou(va, size); + pte2_clear(sysmaps->CMAP3); + sched_unpin(); + mtx_unlock(&sysmaps->lock); +} + +/* + * Sync instruction cache range which is not mapped yet. + */ +void +cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) +{ + uint32_t len, offset; + vm_page_t m; + + /* Write back d-cache on given address range. */ + offset = pa & PAGE_MASK; + for ( ; size != 0; size -= len, pa += len, offset = 0) { + len = min(PAGE_SIZE - offset, size); + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", + __func__, pa)); + pmap_dcache_wb_pou(pa, len, m->md.pat_mode); + } + /* + * I-cache is VIPT. Only way how to flush all virtual mappings + * on given physical address is to invalidate all i-cache. + */ + icache_inv_all(); +} + +void +pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) +{ + + /* Write back d-cache on given address range. */ + if (va >= VM_MIN_KERNEL_ADDRESS) { + dcache_wb_pou(va, size); + } else { + uint32_t len, offset; + vm_paddr_t pa; + vm_page_t m; + + offset = va & PAGE_MASK; + for ( ; size != 0; size -= len, va += len, offset = 0) { + pa = pmap_extract(pmap, va); /* offset is preserved */ + len = min(PAGE_SIZE - offset, size); + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", + __func__, pa)); + pmap_dcache_wb_pou(pa, len, m->md.pat_mode); + } + } + /* + * I-cache is VIPT. 
Only way how to flush all virtual mappings + * on given physical address is to invalidate all i-cache. + */ + icache_inv_all(); +} + +/* + * The implementation of pmap_fault() uses IN_RANGE2() macro which + * depends on the fact that given range size is a power of 2. + */ +CTASSERT(powerof2(NB_IN_PT1)); +CTASSERT(powerof2(PT2MAP_SIZE)); + +#define IN_RANGE2(addr, start, size) \ + ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) + +/* + * Handle access and R/W emulation faults. + */ +int +pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, int usermode) +{ + pt1_entry_t *pte1p, pte1; + pt2_entry_t *pte2p, pte2; + + if (pmap == NULL) + pmap = kernel_pmap; + + /* + * In kernel, we should never get abort with FAR which is in range of + * pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here + * and print out a useful abort message and even get to the debugger + * otherwise it likely ends with never ending loop of aborts. + */ + if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { + /* + * All L1 tables should always be mapped and present. + * However, we check only current one herein. For user mode, + * only permission abort from malicious user is not fatal. + */ + if (!usermode || (idx != FAULT_PERM_L2)) { + CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", + __func__, pmap, pmap->pm_pt1, far); + panic("%s: pm_pt1 abort", __func__); + } + return (EFAULT); + } + if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { + /* + * PT2MAP should be always mapped and present in current + * L1 table. However, only existing L2 tables are mapped + * in PT2MAP. For user mode, only L2 translation abort and + * permission abort from malicious user is not fatal. + */ + if (!usermode || + (idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { + CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", + __func__, pmap, PT2MAP, far); + panic("%s: PT2MAP abort", __func__); + } + return (EFAULT); + } + + /* + * Accesss bits for page and section. 
Note that the entry + * is not in TLB yet, so TLB flush is not necessary. + * + * QQQ: This is hardware emulation, we do not call userret() + * for aborts from user mode. + * We do not lock PMAP, so cmpset() is a need. Hopefully, + * no one removes the mapping when we are here. + */ + if (idx == FAULT_ACCESS_L2) { + pte2p = pt2map_entry(far); +pte2_seta: + pte2 = pte2_load(pte2p); + if (pte2_is_valid(pte2)) { + if (!pte2_cmpset(pte2p, pte2, pte2 | PTE2_A)) { + goto pte2_seta; + } + return (0); + } + } + if (idx == FAULT_ACCESS_L1) { + pte1p = pmap_pte1(pmap, far); +pte1_seta: + pte1 = pte1_load(pte1p); + if (pte1_is_section(pte1)) { + if (!pte1_cmpset(pte1p, pte1, pte1 | PTE1_A)) { + goto pte1_seta; + } + return (0); + } + } + + /* + * Handle modify bits for page and section. Note that the modify + * bit is emulated by software. So PTEx_RO is software read only + * bit and PTEx_NM flag is real harware read only bit. + * + * QQQ: This is hardware emulation, we do not call userret() + * for aborts from user mode. + * We do not lock PMAP, so cmpset() is a need. Hopefully, + * no one removes the mapping when we are here. + */ + if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { + pte2p = pt2map_entry(far); +pte2_setrw: + pte2 = pte2_load(pte2p); + if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && + (pte2 & PTE2_NM)) { + if (!pte2_cmpset(pte2p, pte2, pte2 & ~PTE2_NM)) { + goto pte2_setrw; + } + tlb_flush(trunc_page(far)); + return (0); + } + } + if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { + pte1p = pmap_pte1(pmap, far); +pte1_setrw: + pte1 = pte1_load(pte1p); + if (pte1_is_section(pte1) && !(pte1 & PTE1_RO) && + (pte1 & PTE1_NM)) { + if (!pte1_cmpset(pte1p, pte1, pte1 & ~PTE1_NM)) { + goto pte1_setrw; + } + tlb_flush(pte1_trunc(far)); + return (0); + } + } + + /* + * QQQ: The previous code, mainly fast handling of access and + * modify bits aborts, could be moved to ASM. Now we are + * starting to deal with not fast aborts. 
+ */ + +#ifdef INVARIANTS + /* + * Read an entry in PT2TAB associated with both pmap and far. + * It's safe because PT2TAB is always mapped. + * + * QQQ: We do not lock PMAP, so false positives could happen if + * the mapping is removed concurrently. + */ + pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); + if (pte2_is_valid(pte2)) { + /* + * Now, when we know that L2 page table is allocated, + * we can use PT2MAP to get L2 page table entry. + */ + pte2 = pte2_load(pt2map_entry(far)); + if (pte2_is_valid(pte2)) { + /* + * If L2 page table entry is valid, make sure that + * L1 page table entry is valid too. Note that we + * leave L2 page entries untouched when promoted. + */ + pte1 = pte1_load(pmap_pte1(pmap, far)); + if (!pte1_is_valid(pte1)) { + panic("%s: missing L1 page entry (%p, %#x)", + __func__, pmap, far); + } + } + } +#endif + return (EAGAIN); +} + +/* !!!! REMOVE !!!! */ +void +pmap_pte_init_mmu_v6(void) +{ +} + +void vector_page_setprot(int p) +{ +} + +#if defined(PMAP_DEBUG) +/* + * Reusing of KVA used in pmap_zero_page function !!! 
+ */
+static void
+pmap_zero_page_check(vm_page_t m)
+{
+	uint32_t *p, *end;
+	struct sysmaps *sysmaps;
+
+	/* Map the page through this CPU's CMAP2/CADDR2 slot. */
+	sched_pin();
+	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
+	mtx_lock(&sysmaps->lock);
+	if (pte2_load(sysmaps->CMAP2) != 0)
+		panic("%s: CMAP2 busy", __func__);
+	pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW,
+	    m->md.pat_mode));
+	tlb_flush_local((vm_offset_t)sysmaps->CADDR2);
+	/* Scan the page one 32-bit word at a time; panic on a non-zero word. */
+	end = (uint32_t*)(sysmaps->CADDR2 + PAGE_SIZE);
+	for (p = (uint32_t*)sysmaps->CADDR2; p < end; p++)
+		if (*p != 0)
+			panic("%s: page %p not zero, va: %p", __func__, m,
+			    sysmaps->CADDR2);
+	pte2_clear(sysmaps->CMAP2);
+	sched_unpin();
+	mtx_unlock(&sysmaps->lock);
+}
+
+/*
+ * Print the valid L2 mappings below VM_MIN_KERNEL_ADDRESS of the process
+ * with the given pid, two per output line, and return how many were
+ * printed.  Section mappings are not printed.  Returns 0 when no process
+ * with that pid and a vmspace is found.
+ */
+int
+pmap_pid_dump(int pid)
+{
+	pmap_t pmap;
+	struct proc *p;
+	int npte2 = 0;
+	int i, j, index;
+
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		if (p->p_pid != pid || p->p_vmspace == NULL)
+			continue;
+		/* "index" counts the entries printed on the current line. */
+		index = 0;
+		pmap = vmspace_pmap(p->p_vmspace);
+		for (i = 0; i < NPTE1_IN_PT1; i++) {
+			pt1_entry_t pte1;
+			pt2_entry_t *pte2p, pte2;
+			vm_offset_t base, va;
+			vm_paddr_t pa;
+			vm_page_t m;
+
+			base = i << PTE1_SHIFT;
+			pte1 = pte1_load(&pmap->pm_pt1[i]);
+
+			if (pte1_is_section(pte1)) {
+				/*
+				 * QQQ: Do something here!
+				 */
+			} else if (pte1_is_link(pte1)) {
+				for (j = 0; j < NPTE2_IN_PT2; j++) {
+					va = base + (j << PAGE_SHIFT);
+					/* Stop at the first kernel address. */
+					if (va >= VM_MIN_KERNEL_ADDRESS) {
+						if (index) {
+							index = 0;
+							printf("\n");
+						}
+						sx_sunlock(&allproc_lock);
+						return (npte2);
+					}
+					pte2p = pmap_pte2(pmap, va);
+					pte2 = pte2_load(pte2p);
+					pmap_pte2_release(pte2p);
+					if (!pte2_is_valid(pte2))
+						continue;
+
+					pa = pte2_pa(pte2);
+					m = PHYS_TO_VM_PAGE(pa);
+					printf("va: 0x%x, pa: 0x%x, h: %d, w:"
+					    " %d, f: 0x%x", va, pa,
+					    m->hold_count, m->wire_count,
+					    m->flags);
+					npte2++;
+					index++;
+					if (index >= 2) {
+						index = 0;
+						printf("\n");
+					} else {
+						printf(" ");
+					}
+				}
+			}
+		}
+	}
+	sx_sunlock(&allproc_lock);
+	return (npte2);
+}
+
+/*
+ * Print address space of pmap.
+ */ +static void +pads(pmap_t pmap) +{ + int i, j; + vm_paddr_t va; + pt1_entry_t pte1; + pt2_entry_t *pte2p, pte2; + + if (pmap == kernel_pmap) + return; + for (i = 0; i < NPTE1_IN_PT1; i++) { + pte1 = pte1_load(&pmap->pm_pt1[i]); + if (pte1_is_section(pte1)) { + /* + * QQQ: Do something here! + */ + } else if (pte1_is_link(pte1)) { + for (j = 0; j < NPTE2_IN_PT2; j++) { + va = (i << PTE1_SHIFT) + (j << PAGE_SHIFT); + if (pmap == kernel_pmap && va < KERNBASE) + continue; + if (pmap != kernel_pmap && va >= KERNBASE && + (va < UPT2V_MIN_ADDRESS || + va >= UPT2V_MAX_ADDRESS)) + continue; + + pte2p = pmap_pte2(pmap, va); + pte2 = pte2_load(pte2p); + pmap_pte2_release(pte2p); + if (!pte2_is_valid(pte2)) + continue; + printf("%x:%x ", va, pte2); + } + } + } +} + +void +pmap_pvdump(vm_paddr_t pa) +{ + pv_entry_t pv; + pmap_t pmap; + vm_page_t m; + + printf("pa %x", pa); + m = PHYS_TO_VM_PAGE(pa); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + pmap = PV_PMAP(pv); + printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); + pads(pmap); + } + printf(" "); +} +#endif + +#ifdef DDB +static pt2_entry_t * +pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) +{ + pt1_entry_t pte1; + vm_paddr_t pt2pg_pa; + + pte1 = pte1_load(pmap_pte1(pmap, va)); + if (!pte1_is_link(pte1)) + return (NULL); + + if (pmap_is_current(pmap)) + return (pt2map_entry(va)); + + /* Note that L2 page table size is not equal to PAGE_SIZE. 
*/ + pt2pg_pa = trunc_page(pte1_link_pa(pte1)); + if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { + pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); +#ifdef SMP + PMAP3cpu = PCPU_GET(cpuid); +#endif + tlb_flush_local((vm_offset_t)PADDR3); + } +#ifdef SMP + else if (PMAP3cpu != PCPU_GET(cpuid)) { + PMAP3cpu = PCPU_GET(cpuid); + tlb_flush_local((vm_offset_t)PADDR3); + } +#endif + return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); +} + +static void +dump_pmap(pmap_t pmap) +{ + + printf("pmap %p\n", pmap); + printf(" pm_pt1: %p\n", pmap->pm_pt1); + printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); + printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); +} + +DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) +{ + + pmap_t pmap; + LIST_FOREACH(pmap, &allpmaps, pm_list) { + dump_pmap(pmap); + } +} + +static int +pte2_class(pt2_entry_t pte2) +{ + int cls; + + cls = (pte2 >> 2) & 0x03; + cls |= (pte2 >> 4) & 0x04; + return (cls); +} + +static void +dump_section(pmap_t pmap, uint32_t pte1_idx) +{ +} + +static void +dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) +{ + uint32_t i; + vm_offset_t va; + pt2_entry_t *pte2p, pte2; + vm_page_t m; + + va = pte1_idx << PTE1_SHIFT; + pte2p = pmap_pte2_ddb(pmap, va); + for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { + pte2 = pte2_load(pte2p); + if (pte2 == 0) + continue; + if (!pte2_is_valid(pte2)) { + printf(" 0x%08X: 0x%08X", va, pte2); + if (!invalid_ok) + printf(" - not valid !!!"); + printf("\n"); + continue; + } + m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); + printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, + pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); + if (m != NULL) { + printf(" v:%d h:%d w:%d f:0x%04X\n", m->valid, + m->hold_count, m->wire_count, m->flags); + } else { + printf("\n"); + } + } +} + +static __inline boolean_t +is_pv_chunk_space(vm_offset_t va) +{ + + if ((((vm_offset_t)pv_chunkbase) <= va) && + (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) + return (TRUE); + 
return (FALSE); +} + +DB_SHOW_COMMAND(pmap, pmap_pmap_print) +{ + /* XXX convert args. */ + pmap_t pmap = (pmap_t)addr; + pt1_entry_t pte1; + pt2_entry_t pte2; + vm_offset_t va, eva; + vm_page_t m; + uint32_t i; + boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; + + if (have_addr) { + pmap_t pm; + + LIST_FOREACH(pm, &allpmaps, pm_list) + if (pm == pmap) break; + if (pm == NULL) { + printf("given pmap %p is not in allpmaps list\n", pmap); + return; + } + } else + pmap = PCPU_GET(curpmap); + + eva = (modif[0] == 'u') ? VM_MAXUSER_ADDRESS : 0xFFFFFFFF; + dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ + + printf("pmap: 0x%08X\n", (uint32_t)pmap); + printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); + printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); + + for(i = 0; i < NPTE1_IN_PT1; i++) { + pte1 = pte1_load(&pmap->pm_pt1[i]); + if (pte1 == 0) + continue; + va = i << PTE1_SHIFT; + if (va >= eva) + break; + + if (pte1_is_section(pte1)) { + printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, + !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); + dump_section(pmap, i); + } else if (pte1_is_link(pte1)) { + dump_link_ok = TRUE; + invalid_ok = FALSE; + pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); + m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); + printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", + va, pte1, pte2, m); + if (is_pv_chunk_space(va)) { + printf(" - pv_chunk space"); + if (dump_pv_chunk) + invalid_ok = TRUE; + else + dump_link_ok = FALSE; + } + else if (m != NULL) + printf(" w:%d w2:%u", m->wire_count, + pt2_wirecount_get(m, pte1_index(va))); + if (pte2 == 0) + printf(" !!! pt2tab entry is ZERO"); + else if (pte2_pa(pte1) != pte2_pa(pte2)) + printf(" !!! 
pt2tab entry is DIFFERENT - m: %p", + PHYS_TO_VM_PAGE(pte2_pa(pte2))); + printf("\n"); + if (dump_link_ok) + dump_link(pmap, i, invalid_ok); + } else + printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); + } +} + +static void +dump_pt2tab(pmap_t pmap) +{ + uint32_t i; + pt2_entry_t pte2; + vm_offset_t va; + vm_paddr_t pa; + vm_page_t m; + + printf("PT2TAB:\n"); + for (i = 0; i < PT2TAB_ENTRIES; i++) { + pte2 = pte2_load(&pmap->pm_pt2tab[i]); + if (!pte2_is_valid(pte2)) + continue; + va = i << PT2TAB_SHIFT; + pa = pte2_pa(pte2); + m = PHYS_TO_VM_PAGE(pa); + printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, + pte2_class(pte2), !!(pte2 & PTE2_S), m); + if (m != NULL) + printf(" , h: %d, w: %d, f: 0x%04X pidx: %lld", + m->hold_count, m->wire_count, m->flags, m->pindex); + printf("\n"); + } +} + +DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) +{ + /* XXX convert args. */ + pmap_t pmap = (pmap_t)addr; + pt1_entry_t pte1; + pt2_entry_t pte2; + vm_offset_t va; + uint32_t i, start; + + if (have_addr) { + printf("supported only on current pmap\n"); + return; + } + + pmap = PCPU_GET(curpmap); + printf("curpmap: 0x%08X\n", (uint32_t)pmap); + printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); + printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); + + start = pte1_index((vm_offset_t)PT2MAP); + for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { + pte1 = pte1_load(&pmap->pm_pt1[i]); + if (pte1 == 0) + continue; + va = i << PTE1_SHIFT; + if (pte1_is_section(pte1)) { + printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, + !!(pte1 & PTE1_S)); + dump_section(pmap, i); + } else if (pte1_is_link(pte1)) { + pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); + printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, + pte1, pte2); + if (pte2 == 0) + printf(" !!! 
pt2tab entry is ZERO\n"); + } else + printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); + } + dump_pt2tab(pmap); +} +#endif diff --git a/sys/arm/arm/swtch.S b/sys/arm/arm/swtch.S index 6972c503149..0a0e97ea5c1 100644 --- a/sys/arm/arm/swtch.S +++ b/sys/arm/arm/swtch.S @@ -88,8 +88,6 @@ __FBSDID("$FreeBSD$"); -#define DOMAIN_CLIENT 0x01 - #if defined(_ARM_ARCH_6) && defined(SMP) #define GET_PCPU(tmp, tmp2) \ mrc p15, 0, tmp, c0, c0, 5; \ @@ -109,13 +107,19 @@ __FBSDID("$FreeBSD$"); #endif .Lcurpcpu: - .word _C_LABEL(__pcpu) + .word _C_LABEL(__pcpu) .word PCPU_SIZE -.Lcpufuncs: - .word _C_LABEL(cpufuncs) .Lblocked_lock: .word _C_LABEL(blocked_lock) + +#ifndef ARM_NEW_PMAP + +#define DOMAIN_CLIENT 0x01 + +.Lcpufuncs: + .word _C_LABEL(cpufuncs) + /* * cpu_throw(oldtd, newtd) * @@ -412,6 +416,388 @@ ENTRY(cpu_switch) ldmia r3, {r4-r12, sp, pc} END(cpu_switch) + +#else /* !ARM_NEW_PMAP */ +#include + +ENTRY(cpu_context_switch) /* QQQ: What about macro instead of function? */ + DSB + mcr CP15_TTBR0(r0) /* set the new TTB */ + ISB + mov r0, #(CPU_ASID_KERNEL) + mcr CP15_TLBIASID(r0) /* flush not global TLBs */ + /* + * Flush entire Branch Target Cache because of the branch predictor + * is not architecturally invisible. See ARM Architecture Reference + * Manual ARMv7-A and ARMv7-R edition, page B2-1264(65), Branch + * predictors and Requirements for branch predictor maintenance + * operations sections. + * + * QQQ: The predictor is virtually addressed and holds virtual target + * addresses. Therefore, if mapping is changed, the predictor cache + * must be flushed.The flush is part of entire i-cache invalidation + * what is always called when code mapping is changed. So herein, + * it's the only place where standalone predictor flush must be + * executed in kernel (except self modifying code case). 
+ */ + mcr CP15_BPIALL /* and flush entire Branch Target Cache */ + DSB + mov pc, lr +END(cpu_context_switch) + +/* + * cpu_throw(oldtd, newtd) + * + * Remove current thread state, then select the next thread to run + * and load its state. + * r0 = oldtd + * r1 = newtd + */ +ENTRY(cpu_throw) + mov r10, r0 /* r10 = oldtd */ + mov r11, r1 /* r11 = newtd */ + +#ifdef VFP /* This thread is dying, disable */ + bl _C_LABEL(vfp_discard) /* VFP without preserving state. */ +#endif + GET_PCPU(r8, r9) /* r8 = current pcpu */ + ldr r4, [r8, #PC_CPUID] /* r4 = current cpu id */ + + cmp r10, #0 /* old thread? */ + beq 2f /* no, skip */ + + /* Remove this CPU from the active list. */ + ldr r5, [r8, #PC_CURPMAP] + mov r0, #(PM_ACTIVE) + add r5, r0 /* r5 = old pm_active */ + + /* Compute position and mask. */ +#if _NCPUWORDS > 1 + lsr r0, r4, #3 + bic r0, #3 + add r5, r0 /* r5 = position in old pm_active */ + mov r2, #1 + and r0, r4, #31 + lsl r2, r0 /* r2 = mask */ +#else + mov r2, #1 + lsl r2, r4 /* r2 = mask */ +#endif + /* Clear cpu from old active list. */ +#ifdef SMP +1: ldrex r0, [r5] + bic r0, r2 + strex r1, r0, [r5] + teq r1, #0 + bne 1b +#else + ldr r0, [r5] + bic r0, r2 + str r0, [r5] +#endif + +2: +#ifdef INVARIANTS + cmp r11, #0 /* new thread? */ + beq badsw1 /* no, panic */ +#endif + ldr r7, [r11, #(TD_PCB)] /* r7 = new PCB */ + + /* + * Registers at this point + * r4 = current cpu id + * r7 = new PCB + * r8 = current pcpu + * r11 = newtd + */ + + /* MMU switch to new thread. */ + ldr r0, [r7, #(PCB_PAGEDIR)] +#ifdef INVARIANTS + cmp r0, #0 /* new thread? */ + beq badsw4 /* no, panic */ +#endif + bl _C_LABEL(cpu_context_switch) + + /* + * Set new PMAP as current one. + * Insert cpu to new active list. 
+ */ + + ldr r6, [r11, #(TD_PROC)] /* newtd->proc */ + ldr r6, [r6, #(P_VMSPACE)] /* newtd->proc->vmspace */ + add r6, #VM_PMAP /* newtd->proc->vmspace->pmap */ + str r6, [r8, #PC_CURPMAP] /* store to curpmap */ + + mov r0, #PM_ACTIVE + add r6, r0 /* r6 = new pm_active */ + + /* compute position and mask */ +#if _NCPUWORDS > 1 + lsr r0, r4, #3 + bic r0, #3 + add r6, r0 /* r6 = position in new pm_active */ + mov r2, #1 + and r0, r4, #31 + lsl r2, r0 /* r2 = mask */ +#else + mov r2, #1 + lsl r2, r4 /* r2 = mask */ +#endif + /* Set cpu to new active list. */ +#ifdef SMP +1: ldrex r0, [r6] + orr r0, r2 + strex r1, r0, [r6] + teq r1, #0 + bne 1b +#else + ldr r0, [r6] + orr r0, r2 + str r0, [r6] +#endif + /* + * Registers at this point. + * r7 = new PCB + * r8 = current pcpu + * r11 = newtd + * They must match the ones in sw1 position !!! + */ + DMB + b sw1 /* share new thread init with cpu_switch() */ +END(cpu_throw) + +/* + * cpu_switch(oldtd, newtd, lock) + * + * Save the current thread state, then select the next thread to run + * and load its state. + * r0 = oldtd + * r1 = newtd + * r2 = lock (new lock for old thread) + */ +ENTRY(cpu_switch) + /* Interrupts are disabled. */ +#ifdef INVARIANTS + cmp r0, #0 /* old thread? */ + beq badsw2 /* no, panic */ +#endif + /* Save all the registers in the old thread's pcb. */ + ldr r3, [r0, #(TD_PCB)] + add r3, #(PCB_R4) + stmia r3, {r4-r12, sp, lr, pc} + +#ifdef INVARIANTS + cmp r1, #0 /* new thread? */ + beq badsw3 /* no, panic */ +#endif + /* + * Save arguments. Note that we can now use r0-r14 until + * it is time to restore them for the new thread. However, + * some registers are not safe over function call. 
+ */ + mov r9, r2 /* r9 = lock */ + mov r10, r0 /* r10 = oldtd */ + mov r11, r1 /* r11 = newtd */ + + GET_PCPU(r8, r3) /* r8 = current PCPU */ + ldr r7, [r11, #(TD_PCB)] /* r7 = newtd->td_pcb */ + + + +#ifdef VFP + ldr r3, [r10, #(TD_PCB)] + fmrx r0, fpexc /* If the VFP is enabled */ + tst r0, #(VFPEXC_EN) /* the current thread has */ + movne r1, #1 /* used it, so go save */ + addne r0, r3, #(PCB_VFPSTATE) /* the state into the PCB */ + blne _C_LABEL(vfp_store) /* and disable the VFP. */ +#endif + + /* + * MMU switch. If we're switching to a thread with the same + * address space as the outgoing one, we can skip the MMU switch. + */ + mrc CP15_TTBR0(r1) /* r1 = old TTB */ + ldr r0, [r7, #(PCB_PAGEDIR)] /* r0 = new TTB */ + cmp r0, r1 /* Switching to the same TTB? */ + beq sw0 /* same TTB, skip */ + +#if 1 /* Lazy context switch */ + /* Don't switch mapping for kernel threads */ + ldr r1, =pmap_kern_ttb + ldr r1, [r1] /* r1 = kernel TTB */ + cmp r0, r1 /* Switching to kernel TTB? */ + beq sw0 /* kernel TTB, skip */ +#endif + +#ifdef INVARIANTS + cmp r0, #0 /* new pagedir? */ + beq badsw4 /* no, panic */ +#endif + + bl cpu_context_switch /* new TTB as argument */ + + /* + * Registers at this point + * r7 = new PCB + * r8 = current pcpu + * r9 = lock + * r10 = oldtd + * r11 = newtd + */ + + /* + * Set new PMAP as current one. + * Update active list on PMAPs. + */ + ldr r6, [r11, #TD_PROC] /* newtd->proc */ + ldr r6, [r6, #P_VMSPACE] /* newtd->proc->vmspace */ + add r6, #VM_PMAP /* newtd->proc->vmspace->pmap */ + + ldr r5, [r8, #PC_CURPMAP] /* get old curpmap */ + str r6, [r8, #PC_CURPMAP] /* and save new one */ + + mov r0, #PM_ACTIVE + add r5, r0 /* r5 = old pm_active */ + add r6, r0 /* r6 = new pm_active */ + + /* Compute position and mask. 
*/ + ldr r4, [r8, #PC_CPUID] /* r4 = current cpu id */ +#if _NCPUWORDS > 1 + lsr r0, r4, #3 /* r0 = cpu id / 8 */ + bic r0, #3 /* r0 = byte offset of 32-bit word */ + add r5, r0 /* r5 = position in old pm_active */ + add r6, r0 /* r6 = position in new pm_active */ + mov r2, #1 + and r0, r4, #31 /* r0 = bit index within the word */ + lsl r2, r0 /* r2 = mask */ +#else + mov r2, #1 + lsl r2, r4 /* r2 = mask */ +#endif + /* Clear cpu from old active list. */ +#ifdef SMP +1: ldrex r0, [r5] + bic r0, r2 + strex r1, r0, [r5] + teq r1, #0 /* exclusive store succeeded? */ + bne 1b /* no, retry */ +#else + ldr r0, [r5] + bic r0, r2 + str r0, [r5] +#endif + /* Set cpu in new active list. */ +#ifdef SMP +1: ldrex r0, [r6] + orr r0, r2 + strex r1, r0, [r6] + teq r1, #0 /* exclusive store succeeded? */ + bne 1b /* no, retry */ +#else + ldr r0, [r6] + orr r0, r2 + str r0, [r6] +#endif + +sw0: + /* + * Registers at this point + * r7 = new PCB + * r8 = current pcpu + * r9 = lock + * r10 = oldtd + * r11 = newtd + */ + + /* Change the old thread lock. */ + add r5, r10, #TD_LOCK + DMB +1: ldrex r0, [r5] + strex r1, r9, [r5] + teq r1, #0 + bne 1b + DMB + +sw1: + clrex + /* + * Registers at this point + * r7 = new PCB + * r8 = current pcpu + * r11 = newtd + */ + +#if defined(SMP) && defined(SCHED_ULE) + /* + * i386 and amd64 do the blocked lock test only for SMP and SCHED_ULE + * QQQ: What does it mean in reality and why is it done? + */ + ldr r6, =blocked_lock +1: + ldr r3, [r11, #TD_LOCK] /* atomic write regular read */ + cmp r3, r6 + beq 1b +#endif + /* Set the new tls */ + ldr r0, [r11, #(TD_MD + MD_TP)] + mcr CP15_TPIDRURO(r0) /* write tls thread reg 2 */ + + /* We have a new curthread now so make a note of it */ + str r11, [r8, #PC_CURTHREAD] + mcr CP15_TPIDRPRW(r11) + + /* store pcb in per cpu structure */ + str r7, [r8, #PC_CURPCB] + + /* + * Restore all saved registers and return. Note that some saved + * registers can be changed when either cpu_fork(), cpu_set_upcall(), + * cpu_set_fork_handler(), or makectx() was called. 
+ */ + add r3, r7, #PCB_R4 + ldmia r3, {r4-r12, sp, pc} + +#ifdef INVARIANTS +badsw1: + ldr r0, =sw1_panic_str + bl _C_LABEL(panic) +1: nop + b 1b + +badsw2: + ldr r0, =sw2_panic_str + bl _C_LABEL(panic) +1: nop + b 1b + +badsw3: + ldr r0, =sw3_panic_str + bl _C_LABEL(panic) +1: nop + b 1b + +badsw4: + ldr r0, =sw4_panic_str + bl _C_LABEL(panic) +1: nop + b 1b + +sw1_panic_str: + .asciz "cpu_throw: no newthread supplied.\n" +sw2_panic_str: + .asciz "cpu_switch: no curthread supplied.\n" +sw3_panic_str: + .asciz "cpu_switch: no newthread supplied.\n" +sw4_panic_str: + .asciz "cpu_switch: new pagedir is NULL.\n" +#endif +END(cpu_switch) + + +#endif /* !ARM_NEW_PMAP */ + ENTRY(savectx) stmfd sp!, {lr} sub sp, sp, #4 diff --git a/sys/arm/include/machdep.h b/sys/arm/include/machdep.h index ee524a85247..0f43284c852 100644 --- a/sys/arm/include/machdep.h +++ b/sys/arm/include/machdep.h @@ -5,10 +5,16 @@ #define _MACHDEP_BOOT_MACHDEP_H_ /* Structs that need to be initialised by initarm */ +#ifdef ARM_NEW_PMAP +extern vm_offset_t irqstack; +extern vm_offset_t undstack; +extern vm_offset_t abtstack; +#else struct pv_addr; extern struct pv_addr irqstack; extern struct pv_addr undstack; extern struct pv_addr abtstack; +#endif /* Define various stack sizes in pages */ #define IRQ_STACK_SIZE 1 diff --git a/sys/arm/include/pcb.h b/sys/arm/include/pcb.h index b5ed607fc21..71c3c26a63b 100644 --- a/sys/arm/include/pcb.h +++ b/sys/arm/include/pcb.h @@ -52,11 +52,14 @@ struct pcb { #define PCB_OWNFPU 0x00000001 #define PCB_NOALIGNFLT 0x00000002 caddr_t pcb_onfault; /* On fault handler */ +#ifdef ARM_NEW_PMAP + uint32_t pcb_pagedir; /* TTB0 value */ +#else vm_offset_t pcb_pagedir; /* PT hooks */ uint32_t *pcb_pl1vec; /* PTR to vector_base L1 entry*/ uint32_t pcb_l1vec; /* Value to stuff on ctx sw */ u_int pcb_dacr; /* Domain Access Control Reg */ - +#endif struct vfp_state pcb_vfpstate; /* VP/NEON state */ u_int pcb_vfpcpu; /* VP/NEON last cpu */ } __aligned(8); /* diff --git 
a/sys/arm/include/pmap-v6.h b/sys/arm/include/pmap-v6.h new file mode 100644 index 00000000000..54c0f856634 --- /dev/null +++ b/sys/arm/include/pmap-v6.h @@ -0,0 +1,313 @@ +/*- + * Copyright 2014 Svatopluk Kraus + * Copyright 2014 Michal Meloun + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The ARM version of this file was more or less based on the i386 version, + * which has the following provenance... 
+ * + * Derived from hp300 version by Mike Hibler, this version by William + * Jolitz uses a recursive map [a pde points to the page directory] to + * map the page tables using the pagetables themselves. This is done to + * reduce the impact on kernel virtual memory for lots of sparse address + * space, and to reduce the cost of memory to each process. + * + * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 + * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 + * from: FreeBSD: src/sys/i386/include/pmap.h,v 1.70 2000/11/30 + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_PMAP_H_ +#define _MACHINE_PMAP_H_ + +#include +#include +#include +#include + +typedef uint32_t pt1_entry_t; /* L1 table entry */ +typedef uint32_t pt2_entry_t; /* L2 table entry */ +typedef uint32_t ttb_entry_t; /* TTB entry */ + +#ifdef _KERNEL + +#if 0 +#define PMAP_PTE_NOCACHE // Use uncached page tables +#endif + +/* + * (1) During pmap bootstrap, physical pages for L2 page tables are + * allocated in advance which are used for KVA continuous mapping + * starting from KERNBASE. This makes things more simple. + * (2) During vm subsystem initialization, only vm subsystem itself can + * allocate physical memory safely. As pmap_map() is called during + * this initialization, we must be prepared for that and have some + * preallocated physical pages for L2 page tables. + * + * Note that some more pages for L2 page tables are preallocated too + * for mappings laying above VM_MAX_KERNEL_ADDRESS. + */ +#ifndef NKPT2PG +/* + * The optimal way is to define this in board configuration as + * definition here must be safe enough. It means really big. + * + * 1 GB KVA <=> 256 kernel L2 page table pages + * + * From real platforms: + * 1 GB physical memory <=> 10 pages is enough + * 2 GB physical memory <=> 21 pages is enough + */ +#define NKPT2PG 32 +#endif + +extern vm_paddr_t phys_avail[]; +extern vm_paddr_t dump_avail[]; +extern char *_tmppt; /* poor name! 
*/ +extern vm_offset_t virtual_avail; +extern vm_offset_t virtual_end; + +/* + * Pmap stuff + */ + +/* + * This structure is used to hold a virtual<->physical address + * association and is used mostly by bootstrap code. + */ +struct pv_addr { + SLIST_ENTRY(pv_addr) pv_list; + vm_offset_t pv_va; + vm_paddr_t pv_pa; +}; +#endif +struct pv_entry; +struct pv_chunk; + +struct md_page { + TAILQ_HEAD(,pv_entry) pv_list; + uint16_t pt2_wirecount[4]; + int pat_mode; +}; + +struct pmap { + struct mtx pm_mtx; + pt1_entry_t *pm_pt1; /* KVA of pt1 */ + pt2_entry_t *pm_pt2tab; /* KVA of pt2 pages table */ + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + cpuset_t pm_active; /* active on cpus */ + struct pmap_statistics pm_stats; /* pmap statistics */ + LIST_ENTRY(pmap) pm_list; /* List of all pmaps */ +}; + +typedef struct pmap *pmap_t; + +#ifdef _KERNEL +extern struct pmap kernel_pmap_store; +#define kernel_pmap (&kernel_pmap_store) + +#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) +#define PMAP_LOCK_ASSERT(pmap, type) \ + mtx_assert(&(pmap)->pm_mtx, (type)) +#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) +#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ + NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) +#define PMAP_MTX(pmap) (&(pmap)->pm_mtx) +#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) +#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) +#endif + +/* + * For each vm_page_t, there is a list of all currently valid virtual + * mappings of that page. An entry is a pv_entry_t, the list is pv_list. + */ +typedef struct pv_entry { + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_next; +} *pv_entry_t; + +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. 
+ */ +#define _NPCM 11 +#define _NPCPV 336 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint32_t pc_map[_NPCM]; /* bitmap; 1 = free */ + TAILQ_ENTRY(pv_chunk) pc_lru; + struct pv_entry pc_pventry[_NPCPV]; +}; + +#ifdef _KERNEL +struct pcb; +extern ttb_entry_t pmap_kern_ttb; /* TTB for kernel pmap */ + +#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) + +/* + * Only the following functions or macros may be used before pmap_bootstrap() + * is called: pmap_kenter(), pmap_kextract(), pmap_kremove(), vtophys(), and + * vtopte2(). + */ +void pmap_bootstrap(vm_offset_t ); +void pmap_kenter(vm_offset_t , vm_paddr_t ); +void *pmap_kenter_temporary(vm_paddr_t , int ); +void pmap_kremove(vm_offset_t); +void *pmap_mapdev(vm_paddr_t, vm_size_t); +void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t ); +void pmap_page_set_memattr(vm_page_t , vm_memattr_t ); +void pmap_unmapdev(vm_offset_t, vm_size_t); +void pmap_kenter_device(vm_offset_t , vm_paddr_t ); +void pmap_set_pcb_pagedir(pmap_t , struct pcb *); +void pmap_lazyfix_action(void); + +void pmap_tlb_flush(pmap_t , vm_offset_t ); +void pmap_tlb_flush_range(pmap_t , vm_offset_t , vm_size_t ); +void pmap_tlb_flush_ng(pmap_t ); + +void pmap_dcache_wb_range(vm_paddr_t , vm_size_t , vm_memattr_t ); + +vm_paddr_t pmap_kextract(vm_offset_t ); +int pmap_fault(pmap_t , vm_offset_t , uint32_t , int , int ); +#define vtophys(va) pmap_kextract((vm_offset_t)(va)) + +void pmap_set_tex(void); +void reinit_mmu(ttb_entry_t ttb, u_int aux_clr, u_int aux_set); + +/* + * Pre-bootstrap epoch functions set. 
+ */ +void pmap_bootstrap_prepare(vm_paddr_t ); +vm_paddr_t pmap_preboot_get_pages(u_int ); +void pmap_preboot_map_pages(vm_paddr_t , vm_offset_t , u_int ); +vm_offset_t pmap_preboot_reserve_pages(u_int ); +vm_offset_t pmap_preboot_get_vpages(u_int ); +void pmap_preboot_map_attr(vm_paddr_t , vm_offset_t , vm_size_t , + int , int ); +static __inline void +pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, + vm_size_t size, int prot, int cache) +{ + pmap_preboot_map_attr(pa, va, size, prot, cache); +} + +/* + * This structure is used by machine-dependent code to describe + * static mappings of devices, created at bootstrap time. + */ +struct pmap_devmap { + vm_offset_t pd_va; /* virtual address */ + vm_paddr_t pd_pa; /* physical address */ + vm_size_t pd_size; /* size of region */ + vm_prot_t pd_prot; /* protection code */ + int pd_cache; /* cache attributes */ +}; + +void pmap_devmap_bootstrap(const struct pmap_devmap *); + +#endif /* _KERNEL */ + +// ----------------- TO BE DELETED --------------------------------------------- +#include + +#ifdef _KERNEL + +/* + * sys/arm/arm/elf_trampoline.c + * sys/arm/arm/genassym.c + * sys/arm/arm/machdep.c + * sys/arm/arm/mp_machdep.c + * sys/arm/arm/locore.S + * sys/arm/arm/pmap.c + * sys/arm/arm/swtch.S + * sys/arm/at91/at91_machdep.c + * sys/arm/cavium/cns11xx/econa_machdep.c + * sys/arm/s3c2xx0/s3c24x0_machdep.c + * sys/arm/xscale/ixp425/avila_machdep.c + * sys/arm/xscale/i8134x/crb_machdep.c + * sys/arm/xscale/i80321/ep80219_machdep.c + * sys/arm/xscale/i80321/iq31244_machdep.c + * sys/arm/xscale/pxa/pxa_machdep.c + */ +#define PMAP_DOMAIN_KERNEL 0 /* The kernel uses domain #0 */ + +/* + * sys/arm/arm/busdma_machdep-v6.c + */ +int pmap_dmap_iscurrent(pmap_t pmap); + +/* + * sys/arm/arm/cpufunc.c + */ +void pmap_pte_init_mmu_v6(void); +void vector_page_setprot(int); + + +/* + * sys/arm/arm/db_interface.c + * sys/arm/arm/machdep.c + * sys/arm/arm/minidump_machdep.c + * sys/arm/arm/pmap.c + */ +#define 
pmap_kernel() kernel_pmap + +/* + * sys/arm/arm/bus_space_generic.c (just comment) + * sys/arm/arm/devmap.c + * sys/arm/arm/pmap.c (just comment) + * sys/arm/at91/at91_machdep.c + * sys/arm/cavium/cns11xx/econa_machdep.c + * sys/arm/freescale/imx/imx6_machdep.c (just comment) + * sys/arm/mv/orion/db88f5xxx.c + * sys/arm/mv/mv_localbus.c + * sys/arm/mv/mv_machdep.c + * sys/arm/mv/mv_pci.c + * sys/arm/s3c2xx0/s3c24x0_machdep.c + * sys/arm/versatile/versatile_machdep.c + * sys/arm/xscale/ixp425/avila_machdep.c + * sys/arm/xscale/i8134x/crb_machdep.c + * sys/arm/xscale/i80321/ep80219_machdep.c + * sys/arm/xscale/i80321/iq31244_machdep.c + * sys/arm/xscale/pxa/pxa_machdep.c + */ +#define PTE_DEVICE PTE2_ATTR_DEVICE + + + +#endif /* _KERNEL */ +// ----------------------------------------------------------------------------- + +#endif /* !_MACHINE_PMAP_H_ */ diff --git a/sys/arm/include/pmap.h b/sys/arm/include/pmap.h index 3f0b9e984d7..ea4d7c0a674 100644 --- a/sys/arm/include/pmap.h +++ b/sys/arm/include/pmap.h @@ -46,6 +46,9 @@ * * $FreeBSD$ */ +#ifdef ARM_NEW_PMAP +#include +#else /* ARM_NEW_PMAP */ #ifndef _MACHINE_PMAP_H_ #define _MACHINE_PMAP_H_ @@ -706,3 +709,4 @@ extern vm_paddr_t dump_avail[]; #endif /* !LOCORE */ #endif /* !_MACHINE_PMAP_H_ */ +#endif /* !ARM_NEW_PMAP */ \ No newline at end of file diff --git a/sys/arm/include/pmap_var.h b/sys/arm/include/pmap_var.h new file mode 100644 index 00000000000..a8b14501d68 --- /dev/null +++ b/sys/arm/include/pmap_var.h @@ -0,0 +1,511 @@ +/*- + * Copyright 2014 Svatopluk Kraus + * Copyright 2014 Michal Meloun + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_PMAP_VAR_H_ +#define _MACHINE_PMAP_VAR_H_ + +#include +/* + * Various PMAP defines, exports, and inline functions + * definitions also usable in other MD code. + */ + +/* A number of pages in L1 page table. */ +#define NPG_IN_PT1 (NB_IN_PT1 / PAGE_SIZE) + +/* A number of L2 page tables in a page. */ +#define NPT2_IN_PG (PAGE_SIZE / NB_IN_PT2) + +/* A number of L2 page table entries in a page. */ +#define NPTE2_IN_PG (NPT2_IN_PG * NPTE2_IN_PT2) + +#ifdef _KERNEL + +/* + * A L2 page tables page contains NPT2_IN_PG L2 page tables. Masking of + * pte1_idx by PT2PG_MASK gives us an index to associated L2 page table + * in a page. The PT2PG_SHIFT definition depends on NPT2_IN_PG strictly. + * I.e., (1 << PT2PG_SHIFT) == NPT2_IN_PG must be fulfilled. + */ +#define PT2PG_SHIFT 2 +#define PT2PG_MASK ((1 << PT2PG_SHIFT) - 1) + +/* + * A PT2TAB holds all allocated L2 page table pages in a pmap. 
+ * Right shifting of virtual address by PT2TAB_SHIFT gives us an index + * to L2 page table page in PT2TAB which holds the address mapping. + */ +#define PT2TAB_ENTRIES (NPTE1_IN_PT1 / NPT2_IN_PG) +#define PT2TAB_SHIFT (PTE1_SHIFT + PT2PG_SHIFT) + +/* + * All allocated L2 page table pages in a pmap are mapped into PT2MAP space. + * Right shifting of a virtual address by PT2MAP_SHIFT gives us an index to + * the PTE2 which maps the address. + */ +#define PT2MAP_SIZE (NPTE1_IN_PT1 * NB_IN_PT2) +#define PT2MAP_SHIFT PTE2_SHIFT + +extern pt1_entry_t *kern_pt1; +extern pt2_entry_t *kern_pt2tab; +extern pt2_entry_t *PT2MAP; + +/* + * Virtual interface for L1 page table management. + */ + +static __inline u_int +pte1_index(vm_offset_t va) +{ + + return (va >> PTE1_SHIFT); +} + +static __inline pt1_entry_t * +pte1_ptr(pt1_entry_t *pt1, vm_offset_t va) +{ + + return (pt1 + pte1_index(va)); +} + +static __inline vm_offset_t +pte1_trunc(vm_offset_t va) +{ + + return (va & PTE1_FRAME); +} + +static __inline vm_offset_t +pte1_roundup(vm_offset_t va) +{ + + return ((va + PTE1_OFFSET) & PTE1_FRAME); +} + +/* + * Virtual interface for L1 page table entries management. + * + * XXX: Some of the following functions now with a synchronization barrier + * are called in a loop, so it could be useful to have two versions of them. + * One with the barrier and one without the barrier. In this case, pure + * barrier pte1_sync() should be implemented as well. 
+ */ +static __inline void +pte1_sync(pt1_entry_t *pte1p) +{ + + dsb(); +#ifndef PMAP_PTE_NOCACHE + if (!cpuinfo.coherent_walk) + dcache_wb_pou((vm_offset_t)pte1p, sizeof(*pte1p)); +#endif +} + +static __inline void +pte1_sync_range(pt1_entry_t *pte1p, vm_size_t size) +{ + + dsb(); +#ifndef PMAP_PTE_NOCACHE + if (!cpuinfo.coherent_walk) + dcache_wb_pou((vm_offset_t)pte1p, size); +#endif +} + +static __inline void +pte1_store(pt1_entry_t *pte1p, pt1_entry_t pte1) +{ + + atomic_store_rel_int(pte1p, pte1); + pte1_sync(pte1p); +} + +static __inline void +pte1_clear(pt1_entry_t *pte1p) +{ + + pte1_store(pte1p, 0); +} + +static __inline void +pte1_clear_bit(pt1_entry_t *pte1p, uint32_t bit) +{ + + atomic_clear_int(pte1p, bit); + pte1_sync(pte1p); +} + +static __inline boolean_t +pte1_cmpset(pt1_entry_t *pte1p, pt1_entry_t opte1, pt1_entry_t npte1) +{ + boolean_t ret; + + ret = atomic_cmpset_int(pte1p, opte1, npte1); + if (ret) pte1_sync(pte1p); + + return (ret); +} + +static __inline boolean_t +pte1_is_link(pt1_entry_t pte1) +{ + + return ((pte1 & L1_TYPE_MASK) == L1_TYPE_C); +} + +static __inline int +pte1_is_section(pt1_entry_t pte1) +{ + + return ((pte1 & L1_TYPE_MASK) == L1_TYPE_S); +} + +static __inline boolean_t +pte1_is_dirty(pt1_entry_t pte1) +{ + + return ((pte1 & (PTE1_NM | PTE1_RO)) == 0); +} + +static __inline boolean_t +pte1_is_global(pt1_entry_t pte1) +{ + + return ((pte1 & PTE1_NG) == 0); +} + +static __inline boolean_t +pte1_is_valid(pt1_entry_t pte1) +{ + int l1_type; + + l1_type = pte1 & L1_TYPE_MASK; + return ((l1_type == L1_TYPE_C) || (l1_type == L1_TYPE_S)); +} + +static __inline boolean_t +pte1_is_wired(pt1_entry_t pte1) +{ + + return (pte1 & PTE1_W); +} + +static __inline pt1_entry_t +pte1_load(pt1_entry_t *pte1p) +{ + pt1_entry_t pte1; + + pte1 = *pte1p; + return (pte1); +} + +static __inline pt1_entry_t +pte1_load_clear(pt1_entry_t *pte1p) +{ + pt1_entry_t opte1; + + opte1 = atomic_readandclear_int(pte1p); + pte1_sync(pte1p); + return (opte1); +} 
+ +static __inline void +pte1_set_bit(pt1_entry_t *pte1p, uint32_t bit) +{ + + atomic_set_int(pte1p, bit); + pte1_sync(pte1p); +} + +static __inline vm_paddr_t +pte1_pa(pt1_entry_t pte1) +{ + + return ((vm_paddr_t)(pte1 & PTE1_FRAME)); +} + +static __inline vm_paddr_t +pte1_link_pa(pt1_entry_t pte1) +{ + + return ((vm_paddr_t)(pte1 & L1_C_ADDR_MASK)); +} + +/* + * Virtual interface for L2 page table entries management. + * + * XXX: Some of the following functions now with a synchronization barrier + * are called in a loop, so it could be useful to have two versions of them. + * One with the barrier and one without the barrier. + */ + +static __inline void +pte2_sync(pt2_entry_t *pte2p) +{ + + dsb(); +#ifndef PMAP_PTE_NOCACHE + if (!cpuinfo.coherent_walk) + dcache_wb_pou((vm_offset_t)pte2p, sizeof(*pte2p)); +#endif +} + +static __inline void +pte2_sync_range(pt2_entry_t *pte2p, vm_size_t size) +{ + + dsb(); +#ifndef PMAP_PTE_NOCACHE + if (!cpuinfo.coherent_walk) + dcache_wb_pou((vm_offset_t)pte2p, size); +#endif +} + +static __inline void +pte2_store(pt2_entry_t *pte2p, pt2_entry_t pte2) +{ + + atomic_store_rel_int(pte2p, pte2); + pte2_sync(pte2p); +} + +static __inline void +pte2_clear(pt2_entry_t *pte2p) +{ + + pte2_store(pte2p, 0); +} + +static __inline void +pte2_clear_bit(pt2_entry_t *pte2p, uint32_t bit) +{ + + atomic_clear_int(pte2p, bit); + pte2_sync(pte2p); +} + +static __inline boolean_t +pte2_cmpset(pt2_entry_t *pte2p, pt2_entry_t opte2, pt2_entry_t npte2) +{ + boolean_t ret; + + ret = atomic_cmpset_int(pte2p, opte2, npte2); + if (ret) pte2_sync(pte2p); + + return (ret); +} + +static __inline boolean_t +pte2_is_dirty(pt2_entry_t pte2) +{ + + return ((pte2 & (PTE2_NM | PTE2_RO)) == 0); +} + +static __inline boolean_t +pte2_is_global(pt2_entry_t pte2) +{ + + return ((pte2 & PTE2_NG) == 0); +} + +static __inline boolean_t +pte2_is_valid(pt2_entry_t pte2) +{ + + return (pte2 & PTE2_V); +} + +static __inline boolean_t +pte2_is_wired(pt2_entry_t pte2) +{ + + 
return (pte2 & PTE2_W); +} + +static __inline pt2_entry_t +pte2_load(pt2_entry_t *pte2p) +{ + pt2_entry_t pte2; + + pte2 = *pte2p; + return (pte2); +} + +static __inline pt2_entry_t +pte2_load_clear(pt2_entry_t *pte2p) +{ + pt2_entry_t opte2; + + opte2 = atomic_readandclear_int(pte2p); + pte2_sync(pte2p); + return (opte2); +} + +static __inline void +pte2_set_bit(pt2_entry_t *pte2p, uint32_t bit) +{ + + atomic_set_int(pte2p, bit); + pte2_sync(pte2p); +} + +static __inline void +pte2_set_wired(pt2_entry_t *pte2p, boolean_t wired) +{ + + /* + * Wired bit is transparent for page table walk, + * so pte2_sync() is not needed. + */ + if (wired) + atomic_set_int(pte2p, PTE2_W); + else + atomic_clear_int(pte2p, PTE2_W); +} + +static __inline vm_paddr_t +pte2_pa(pt2_entry_t pte2) +{ + + return ((vm_paddr_t)(pte2 & PTE2_FRAME)); +} + +static __inline u_int +pte2_attr(pt2_entry_t pte2) +{ + + return ((u_int)(pte2 & PTE2_ATTR_MASK)); +} + +/* + * Virtual interface for L2 page tables mapping management. + */ + +static __inline u_int +pt2tab_index(vm_offset_t va) +{ + + return (va >> PT2TAB_SHIFT); +} + +static __inline pt2_entry_t * +pt2tab_entry(pt2_entry_t *pt2tab, vm_offset_t va) +{ + + return (pt2tab + pt2tab_index(va)); +} + +static __inline void +pt2tab_store(pt2_entry_t *pte2p, pt2_entry_t pte2) +{ + + pte2_store(pte2p,pte2); +} + +static __inline pt2_entry_t +pt2tab_load(pt2_entry_t *pte2p) +{ + + return (pte2_load(pte2p)); +} + +static __inline pt2_entry_t +pt2tab_load_clear(pt2_entry_t *pte2p) +{ + + return (pte2_load_clear(pte2p)); +} + +static __inline u_int +pt2map_index(vm_offset_t va) +{ + + return (va >> PT2MAP_SHIFT); +} + +static __inline pt2_entry_t * +pt2map_entry(vm_offset_t va) +{ + + return (PT2MAP + pt2map_index(va)); +} + +/* + * Virtual interface for pmap structure & kernel shortcuts. 
+ */ + +static __inline pt1_entry_t * +pmap_pte1(pmap_t pmap, vm_offset_t va) +{ + + return (pte1_ptr(pmap->pm_pt1, va)); +} + +static __inline pt1_entry_t * +kern_pte1(vm_offset_t va) +{ + + return (pte1_ptr(kern_pt1, va)); +} + +static __inline pt2_entry_t * +pmap_pt2tab_entry(pmap_t pmap, vm_offset_t va) +{ + + return (pt2tab_entry(pmap->pm_pt2tab, va)); +} + +static __inline pt2_entry_t * +kern_pt2tab_entry(vm_offset_t va) +{ + + return (pt2tab_entry(kern_pt2tab, va)); +} + +static __inline vm_page_t +pmap_pt2_page(pmap_t pmap, vm_offset_t va) +{ + pt2_entry_t pte2; + + pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); + return (PHYS_TO_VM_PAGE(pte2 & PTE2_FRAME)); +} + +static __inline vm_page_t +kern_pt2_page(vm_offset_t va) +{ + pt2_entry_t pte2; + + pte2 = pte2_load(kern_pt2tab_entry(va)); + return (PHYS_TO_VM_PAGE(pte2 & PTE2_FRAME)); +} + +#endif /* _KERNEL */ +#endif /* !_MACHINE_PMAP_VAR_H_ */ diff --git a/sys/arm/include/pte-v6.h b/sys/arm/include/pte-v6.h new file mode 100644 index 00000000000..536c1e1258d --- /dev/null +++ b/sys/arm/include/pte-v6.h @@ -0,0 +1,327 @@ +/*- + * Copyright 2014 Svatopluk Kraus + * Copyright 2014 Michal Meloun + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MACHINE_PTE_H_ +#define _MACHINE_PTE_H_ + +/* + * Domain Types for the Domain Access Control Register. + */ +#define DOMAIN_FAULT 0x00 /* no access */ +#define DOMAIN_CLIENT 0x01 /* client */ +#define DOMAIN_RESERVED 0x02 /* reserved */ +#define DOMAIN_MANAGER 0x03 /* manager */ + +/* + * TEX remap registers attributes + */ +#define PRRR_SO 0 /* Strongly ordered memory */ +#define PRRR_DEV 1 /* Device memory */ +#define PRRR_MEM 2 /* Normal memory */ +#define PRRR_DS0 (1 << 16) /* Shared bit for Device, S = 0 */ +#define PRRR_DS1 (1 << 17) /* Shared bit for Device, S = 1 */ +#define PRRR_NS0 (1 << 18) /* Shared bit for Normal, S = 0 */ +#define PRRR_NS1 (1 << 19) /* Shared bit for Normal, S = 1 */ +#define PRRR_NOS_SHIFT 24 /* base shift for Not Outer Shared bits */ + +#define NMRR_NC 0 /* Noncacheable */ +#define NMRR_WB_WA 1 /* Write Back, Write Allocate */ +#define NMRR_WT 2 /* Write Through, Non-Write Allocate */ +#define NMRR_WB 3 /* Write Back, Non-Write Allocate */ + +/* + * + * The ARM MMU is capable of mapping memory in the following chunks: + * + * 16M Supersections (L1 table) + * + * 1M Sections (L1 table) + * + * 64K Large Pages (L2 table) + * + * 4K Small Pages (L2 table) + * + * + * Coarse Tables can map Large and Small Pages. + * Coarse Tables are 1K in length. + * + * The Translation Table Base register holds the pointer to the + * L1 Table. 
The L1 Table is a 16K contiguous chunk of memory + * aligned to a 16K boundary. Each entry in the L1 Table maps + * 1M of virtual address space, either via a Section mapping or + * via an L2 Table. + * + */ +#define L1_TABLE_SIZE 0x4000 /* 16K */ +#define L1_ENTRIES 0x1000 /* 4K entries */ +#define L2_TABLE_SIZE 0x0400 /* 1K */ +#define L2_ENTRIES 0x0100 /* 256 entries */ + +/* ARMv6 super-sections. */ +#define L1_SUP_SIZE 0x01000000 /* 16M */ +#define L1_SUP_OFFSET (L1_SUP_SIZE - 1) +#define L1_SUP_FRAME (~L1_SUP_OFFSET) +#define L1_SUP_SHIFT 24 + +#define L1_S_SIZE 0x00100000 /* 1M */ +#define L1_S_OFFSET (L1_S_SIZE - 1) +#define L1_S_FRAME (~L1_S_OFFSET) +#define L1_S_SHIFT 20 + +#define L2_L_SIZE 0x00010000 /* 64K */ +#define L2_L_OFFSET (L2_L_SIZE - 1) +#define L2_L_FRAME (~L2_L_OFFSET) +#define L2_L_SHIFT 16 + +#define L2_S_SIZE 0x00001000 /* 4K */ +#define L2_S_OFFSET (L2_S_SIZE - 1) +#define L2_S_FRAME (~L2_S_OFFSET) +#define L2_S_SHIFT 12 + +/* + * ARM MMU L1 Descriptors + */ +#define L1_TYPE_INV 0x00 /* Invalid (fault) */ +#define L1_TYPE_C 0x01 /* Coarse L2 */ +#define L1_TYPE_S 0x02 /* Section */ +#define L1_TYPE_MASK 0x03 /* Mask of type bits */ + +/* L1 Section Descriptor */ +#define L1_S_B 0x00000004 /* bufferable Section */ +#define L1_S_C 0x00000008 /* cacheable Section */ +#define L1_S_NX 0x00000010 /* not executable */ +#define L1_S_DOM(x) ((x) << 5) /* domain */ +#define L1_S_DOM_MASK L1_S_DOM(0xf) +#define L1_S_P 0x00000200 /* ECC enable for this section */ +#define L1_S_AP(x) ((x) << 10) /* access permissions */ +#define L1_S_AP0 0x00000400 /* access permissions bit 0 */ +#define L1_S_AP1 0x00000800 /* access permissions bit 1 */ +#define L1_S_TEX(x) ((x) << 12) /* type extension */ +#define L1_S_TEX0 0x00001000 /* type extension bit 0 */ +#define L1_S_TEX1 0x00002000 /* type extension bit 1 */ +#define L1_S_TEX2 0x00004000 /* type extension bit 2 */ +#define L1_S_AP2 0x00008000 /* access permissions bit 2 */ +#define L1_S_SHARED 0x00010000 /* shared */ 
+#define L1_S_NG 0x00020000 /* not global */ +#define L1_S_SUPERSEC 0x00040000 /* Section is a super-section. */ +#define L1_S_ADDR_MASK 0xfff00000 /* phys address of section */ + +/* L1 Coarse Descriptor */ +#define L1_C_DOM(x) ((x) << 5) /* domain */ +#define L1_C_DOM_MASK L1_C_DOM(0xf) +#define L1_C_P 0x00000200 /* ECC enable for this section */ +#define L1_C_ADDR_MASK 0xfffffc00 /* phys address of L2 Table */ + +/* + * ARM MMU L2 Descriptors + */ +#define L2_TYPE_INV 0x00 /* Invalid (fault) */ +#define L2_TYPE_L 0x01 /* Large Page - 64k - not used yet */ +#define L2_TYPE_S 0x02 /* Small Page - 4k */ +#define L2_TYPE_MASK 0x03 + +#define L2_NX 0x00000001 /* Not executable */ +#define L2_B 0x00000004 /* Bufferable page */ +#define L2_C 0x00000008 /* Cacheable page */ +#define L2_AP(x) ((x) << 4) +#define L2_AP0 0x00000010 /* access permissions bit 0 */ +#define L2_AP1 0x00000020 /* access permissions bit 1 */ +#define L2_TEX(x) ((x) << 6) /* type extension */ +#define L2_TEX0 0x00000040 /* type extension bit 0 */ +#define L2_TEX1 0x00000080 /* type extension bit 1 */ +#define L2_TEX2 0x00000100 /* type extension bit 2 */ +#define L2_AP2 0x00000200 /* access permissions bit 2 */ +#define L2_SHARED 0x00000400 /* shared */ +#define L2_NG 0x00000800 /* not global */ + +/* + * TEX classes encoding + */ +#define TEX1_CLASS_0 ( 0) +#define TEX1_CLASS_1 ( L1_S_B) +#define TEX1_CLASS_2 ( L1_S_C ) +#define TEX1_CLASS_3 ( L1_S_C | L1_S_B) +#define TEX1_CLASS_4 (L1_S_TEX0 ) +#define TEX1_CLASS_5 (L1_S_TEX0 | L1_S_B) +#define TEX1_CLASS_6 (L1_S_TEX0 | L1_S_C ) /* Reserved for ARM11 */ +#define TEX1_CLASS_7 (L1_S_TEX0 | L1_S_C | L1_S_B) + +#define TEX2_CLASS_0 ( 0) +#define TEX2_CLASS_1 ( L2_B) +#define TEX2_CLASS_2 ( L2_C ) +#define TEX2_CLASS_3 ( L2_C | L2_B) +#define TEX2_CLASS_4 (L2_TEX0 ) +#define TEX2_CLASS_5 (L2_TEX0 | L2_B) +#define TEX2_CLASS_6 (L2_TEX0 | L2_C ) /* Reserved for ARM11 */ +#define TEX2_CLASS_7 (L2_TEX0 | L2_C | L2_B) + +/* L1 table definitions.
*/ +#define NB_IN_PT1 L1_TABLE_SIZE +#define NPTE1_IN_PT1 L1_ENTRIES + +/* L2 table definitions. */ +#define NB_IN_PT2 L2_TABLE_SIZE +#define NPTE2_IN_PT2 L2_ENTRIES + +/* + * Map memory attributes to TEX classes + */ +#define PTE2_ATTR_WB_WA TEX2_CLASS_0 +#define PTE2_ATTR_NOCACHE TEX2_CLASS_1 +#define PTE2_ATTR_DEVICE TEX2_CLASS_2 +#define PTE2_ATTR_SO TEX2_CLASS_3 +/* + * Software defined bits for L1 descriptors + * - L1_AP0 is used as page accessed bit + * - L1_AP2 (RO / not RW) is used as page not modified bit + * - L1_TEX1 is used as software emulated RO bit + */ +#define PTE1_V L1_TYPE_S /* Valid bit */ +#define PTE1_A L1_S_AP0 /* Accessed - software emulated */ +#define PTE1_NM L1_S_AP2 /* not modified bit - software emulated + * used as real write enable bit */ +#define PTE1_M 0 /* Modified (dummy) */ +#define PTE1_S L1_S_SHARED /* Shared */ +#define PTE1_NG L1_S_NG /* Not global */ +#define PTE1_G 0 /* Global (dummy) */ +#define PTE1_NX L1_S_NX /* Not executable */ +#define PTE1_X 0 /* Executable (dummy) */ +#define PTE1_RO L1_S_TEX1 /* Read Only */ +#define PTE1_RW 0 /* Read-Write (dummy) */ +#define PTE1_U L1_S_AP1 /* User */ +#define PTE1_NU 0 /* Not user (kernel only) (dummy) */ +#define PTE1_W L1_S_TEX2 /* Wired */ + +#define PTE1_SHIFT L1_S_SHIFT +#define PTE1_SIZE L1_S_SIZE +#define PTE1_OFFSET L1_S_OFFSET +#define PTE1_FRAME L1_S_FRAME + +#define PTE1_ATTR_MASK (L1_S_TEX0 | L1_S_C | L1_S_B) + +#define PTE1_AP_KR (PTE1_RO | PTE1_NM) +#define PTE1_AP_KRW 0 +#define PTE1_AP_KRUR (PTE1_RO | PTE1_NM | PTE1_U) +#define PTE1_AP_KRWURW PTE1_U + +/* + * PTE1 descriptors creation macros.
+ */ +#define PTE1_PA(pa) ((pa) & PTE1_FRAME) +#define PTE1_AP_COMMON (PTE1_V | PTE1_S) + +#define PTE1(pa, ap, attr) (PTE1_PA(pa) | (ap) | (attr) | PTE1_AP_COMMON) + +#define PTE1_KERN(pa, ap, attr) PTE1(pa, (ap) | PTE1_A | PTE1_G, attr) +#define PTE1_KERN_NG(pa, ap, attr) PTE1(pa, (ap) | PTE1_A | PTE1_NG, attr) + +#define PTE1_LINK(pa) (((pa) & L1_C_ADDR_MASK) | L1_TYPE_C) + +/* + * Software defined bits for L2 descriptors + * - L2_AP0 is used as page accessed bit + * - L2_AP2 (RO / not RW) is used as page not modified bit + * - L2_TEX1 is used as software emulated RO bit + */ +#define PTE2_V L2_TYPE_S /* Valid bit */ +#define PTE2_A L2_AP0 /* Accessed - software emulated */ +#define PTE2_NM L2_AP2 /* not modified bit - software emulated + * used as real write enable bit */ +#define PTE2_M 0 /* Modified (dummy) */ +#define PTE2_S L2_SHARED /* Shared */ +#define PTE2_NG L2_NG /* Not global */ +#define PTE2_G 0 /* Global (dummy) */ +#define PTE2_NX L2_NX /* Not executable */ +#define PTE2_X 0 /* Executable (dummy) */ +#define PTE2_RO L2_TEX1 /* Read Only */ +#define PTE2_U L2_AP1 /* User */ +#define PTE2_NU 0 /* Not user (kernel only) (dummy) */ +#define PTE2_W L2_TEX2 /* Wired */ + +#define PTE2_SHIFT L2_S_SHIFT +#define PTE2_SIZE L2_S_SIZE +#define PTE2_OFFSET L2_S_OFFSET +#define PTE2_FRAME L2_S_FRAME + +#define PTE2_ATTR_MASK (L2_TEX0 | L2_C | L2_B) + +#define PTE2_AP_KR (PTE2_RO | PTE2_NM) +#define PTE2_AP_KRW 0 +#define PTE2_AP_KRUR (PTE2_RO | PTE2_NM | PTE2_U) +#define PTE2_AP_KRWURW PTE2_U + +/* + * PTE2 descriptors creation macros.
+ */ +#define PTE2_PA(pa) ((pa) & PTE2_FRAME) +#define PTE2_AP_COMMON (PTE2_V | PTE2_S) + +#define PTE2(pa, ap, attr) (PTE2_PA(pa) | (ap) | (attr) | PTE2_AP_COMMON) + +#define PTE2_KERN(pa, ap, attr) PTE2(pa, (ap) | PTE2_A | PTE2_G, attr) +#define PTE2_KERN_NG(pa, ap, attr) PTE2(pa, (ap) | PTE2_A | PTE2_NG, attr) + + +// ----------------- TO BE DELETED --------------------------------------------- + +/* + * sys/arm/arm/elf_trampoline.c + */ +#define AP_KRW 0x01 /* kernel read/write */ + +/* + * lib/libkvm/kvm_arm.c + */ +#define L1_ADDR_MASK 0xfffffc00 + +/* + * lib/libkvm/kvm_arm.c + */ +#define L2_ADDR_BITS 0x000ff000 /* L2 PTE address bits */ + +#ifndef LOCORE +/* + * sys/arm/arm/minidump_machdep.c + * sys/arm/arm/pmap.c + * sys/arm/arm/pmap.h (hack for our hack in pmap.h ) + * lib/libkvm/kvm_arm.c + */ +typedef uint32_t pd_entry_t; /* page directory entry */ + +/* + * sys/arm/arm/minidump_machdep.c + * sys/arm/arm/pmap.c + * sys/arm/arm/pmap.h (hack for our hack in pmap.h ) + * sys/arm/include/param.h + */ +typedef uint32_t pt_entry_t; /* page table entry */ +#endif +// ----------------------------------------------------------------------------- + +#endif /* !_MACHINE_PTE_H_ */ diff --git a/sys/arm/include/pte.h b/sys/arm/include/pte.h index d12fc24bae4..ef804ae9808 100644 --- a/sys/arm/include/pte.h +++ b/sys/arm/include/pte.h @@ -33,6 +33,9 @@ * * $FreeBSD$ */ +#ifdef ARM_NEW_PMAP +#include +#else /* ARM_NEW_PMAP */ #ifndef _MACHINE_PTE_H_ #define _MACHINE_PTE_H_ @@ -352,5 +355,6 @@ typedef uint32_t pt_entry_t; /* page table entry */ * 1 X 1 1 1 Y Y WT Y Y */ #endif /* !_MACHINE_PTE_H_ */ +#endif /* !ARM_NEW_PMAP */ /* End of pte.h */ diff --git a/sys/arm/include/sf_buf.h b/sys/arm/include/sf_buf.h index b761cc70c69..75d5722445c 100644 --- a/sys/arm/include/sf_buf.h +++ b/sys/arm/include/sf_buf.h @@ -33,7 +33,11 @@ static inline void sf_buf_map(struct sf_buf *sf, int flags) { +#ifdef ARM_NEW_PMAP + pmap_qenter(sf->kva, &(sf->m), 1); +#else 
pmap_kenter(sf->kva, VM_PAGE_TO_PHYS(sf->m)); +#endif } static inline int diff --git a/sys/arm/include/smp.h b/sys/arm/include/smp.h index 3803674dded..1abe3988f1a 100644 --- a/sys/arm/include/smp.h +++ b/sys/arm/include/smp.h @@ -13,6 +13,8 @@ #define IPI_STOP_HARD 4 #define IPI_HARDCLOCK 6 #define IPI_TLB 7 +#define IPI_CACHE 8 +#define IPI_LAZYPMAP 9 void init_secondary(int cpu); void mpentry(void); diff --git a/sys/arm/include/vm.h b/sys/arm/include/vm.h index 6f27276fd81..5b2f125d126 100644 --- a/sys/arm/include/vm.h +++ b/sys/arm/include/vm.h @@ -29,8 +29,22 @@ #ifndef _MACHINE_VM_H_ #define _MACHINE_VM_H_ +#ifdef ARM_NEW_PMAP +#include + +#define VM_MEMATTR_WB_WA ((vm_memattr_t)PTE2_ATTR_WB_WA) +#define VM_MEMATTR_NOCACHE ((vm_memattr_t)PTE2_ATTR_NOCACHE) +#define VM_MEMATTR_DEVICE ((vm_memattr_t)PTE2_ATTR_DEVICE) +#define VM_MEMATTR_SO ((vm_memattr_t)PTE2_ATTR_SO) + +#define VM_MEMATTR_DEFAULT VM_MEMATTR_WB_WA +#define VM_MEMATTR_UNCACHEABLE VM_MEMATTR_SO /*name is misused by DMA */ + + +#else /* Memory attribute configuration. */ #define VM_MEMATTR_DEFAULT 0 #define VM_MEMATTR_UNCACHEABLE 1 +#endif #endif /* !_MACHINE_VM_H_ */ diff --git a/sys/arm/include/vmparam.h b/sys/arm/include/vmparam.h index 25e2c2496a3..91b6a63dd47 100644 --- a/sys/arm/include/vmparam.h +++ b/sys/arm/include/vmparam.h @@ -119,13 +119,10 @@ #define VM_LEVEL_0_ORDER 8 #endif -#define UPT_MAX_ADDRESS VADDR(UPTPTDI + 3, 0) -#define UPT_MIN_ADDRESS VADDR(UPTPTDI, 0) - #define VM_MIN_ADDRESS (0x00001000) #ifndef VM_MAXUSER_ADDRESS -#define VM_MAXUSER_ADDRESS KERNBASE -#endif /* VM_MAXUSER_ADDRESS */ +#define VM_MAXUSER_ADDRESS (KERNBASE - 0x00400000) /* !!! 
PT2MAP_SIZE */ +#endif #define VM_MAX_ADDRESS VM_MAXUSER_ADDRESS #define USRSTACK VM_MAXUSER_ADDRESS diff --git a/sys/conf/files.arm b/sys/conf/files.arm index 9556e502fcb..167b222a77b 100644 --- a/sys/conf/files.arm +++ b/sys/conf/files.arm @@ -42,7 +42,8 @@ arm/arm/pl310.c optional pl310 arm/arm/platform.c optional platform arm/arm/platform_if.m optional platform arm/arm/pmap.c optional !armv6 -arm/arm/pmap-v6.c optional armv6 +arm/arm/pmap-v6.c optional armv6 !arm_new_pmap +arm/arm/pmap-v6-new.c optional armv6 arm_new_pmap arm/arm/sc_machdep.c optional sc arm/arm/setcpsr.S standard arm/arm/setstack.s standard diff --git a/sys/conf/options.arm b/sys/conf/options.arm index aec06ae5390..5282489ee7b 100644 --- a/sys/conf/options.arm +++ b/sys/conf/options.arm @@ -6,6 +6,7 @@ ARM_KERN_DIRECTMAP opt_vm.h ARM_L2_PIPT opt_global.h ARM_MANY_BOARD opt_global.h ARM_NEW_PMAP opt_global.h +NKPT2PG opt_pmap.h ARM_WANT_TP_ADDRESS opt_global.h COUNTS_PER_SEC opt_timer.h CPU_ARM9 opt_global.h