Rework SLB trap handling so that double-faults into an SLB trap handler are

possible, and double faults within an SLB trap handler are not. The result
is that it is possible to take an SLB fault at any time, on any address, for
any reason, at any point in the kernel.

This lets us do two important things. First, it removes the (soft) 16 GB RAM
ceiling on PPC64 as well as any architectural limitations on KVA space.
Second, it lets the kernel tolerate poorly designed hypervisors that
have a tendency to fail to restore the SLB properly after a hypervisor
context switch.

MFC after:	6 weeks
This commit is contained in:
Nathan Whitehorn 2012-01-15 00:08:14 +00:00
parent 5d48232408
commit ae09ab8f63
6 changed files with 235 additions and 58 deletions

View file

@ -238,6 +238,7 @@ extern void *trapcode64;
extern void *rstcode, *rstsize;
#endif
extern void *trapcode, *trapsize;
extern void *slbtrap, *slbtrapsize;
extern void *alitrap, *alisize;
extern void *dsitrap, *dsisize;
extern void *decrint, *decrsize;
@ -490,8 +491,8 @@ powerpc_init(vm_offset_t startkernel, vm_offset_t endkernel,
bcopy(&dsitrap, (void *)(EXC_DSI + trap_offset), (size_t)&dsisize);
bcopy(generictrap, (void *)EXC_ISI, (size_t)&trapsize);
#ifdef __powerpc64__
bcopy(generictrap, (void *)EXC_DSE, (size_t)&trapsize);
bcopy(generictrap, (void *)EXC_ISE, (size_t)&trapsize);
bcopy(&slbtrap, (void *)EXC_DSE, (size_t)&slbtrapsize);
bcopy(&slbtrap, (void *)EXC_ISE, (size_t)&slbtrapsize);
#endif
bcopy(generictrap, (void *)EXC_EXI, (size_t)&trapsize);
bcopy(generictrap, (void *)EXC_FPU, (size_t)&trapsize);

View file

@ -409,15 +409,11 @@ slb_alloc_tree(void)
/* Lock entries mapping kernel text and stacks */
#define SLB_SPILLABLE(slbe) \
(((slbe & SLBE_ESID_MASK) < VM_MIN_KERNEL_ADDRESS && \
(slbe & SLBE_ESID_MASK) > 16*SEGMENT_LENGTH) || \
(slbe & SLBE_ESID_MASK) > VM_MAX_KERNEL_ADDRESS)
void
slb_insert_kernel(uint64_t slbe, uint64_t slbv)
{
struct slb *slbcache;
int i, j;
int i;
/* We don't want to be preempted while modifying the kernel map */
critical_enter();
@ -437,15 +433,9 @@ slb_insert_kernel(uint64_t slbe, uint64_t slbv)
slbcache[USER_SLB_SLOT].slbe = 1;
}
for (i = mftb() % n_slbs, j = 0; j < n_slbs; j++, i = (i+1) % n_slbs) {
if (i == USER_SLB_SLOT)
continue;
if (SLB_SPILLABLE(slbcache[i].slbe))
break;
}
KASSERT(j < n_slbs, ("All kernel SLB slots locked!"));
i = mftb() % n_slbs;
if (i == USER_SLB_SLOT)
i = (i+1) % n_slbs;
fillkernslb:
KASSERT(i != USER_SLB_SLOT,

View file

@ -88,7 +88,9 @@ static int handle_onfault(struct trapframe *frame);
static void syscall(struct trapframe *frame);
#ifdef __powerpc64__
static int handle_slb_spill(pmap_t pm, vm_offset_t addr);
void handle_kernel_slb_spill(int, register_t, register_t);
static int handle_user_slb_spill(pmap_t pm, vm_offset_t addr);
extern int n_slbs;
#endif
int setfault(faultbuf); /* defined in locore.S */
@ -191,7 +193,7 @@ trap(struct trapframe *frame)
#ifdef __powerpc64__
case EXC_ISE:
case EXC_DSE:
if (handle_slb_spill(&p->p_vmspace->vm_pmap,
if (handle_user_slb_spill(&p->p_vmspace->vm_pmap,
(type == EXC_ISE) ? frame->srr0 :
frame->cpu.aim.dar) != 0)
sig = SIGSEGV;
@ -259,27 +261,20 @@ trap(struct trapframe *frame)
KASSERT(cold || td->td_ucred != NULL,
("kernel trap doesn't have ucred"));
switch (type) {
case EXC_DSI:
if (trap_pfault(frame, 0) == 0)
return;
break;
#ifdef __powerpc64__
case EXC_DSE:
if ((frame->cpu.aim.dar & SEGMENT_MASK) == USER_ADDR) {
__asm __volatile ("slbmte %0, %1" ::
"r"(td->td_pcb->pcb_cpu.aim.usr_vsid),
"r"(USER_SLB_SLBE));
"r"(td->td_pcb->pcb_cpu.aim.usr_vsid),
"r"(USER_SLB_SLBE));
return;
}
/* FALLTHROUGH */
case EXC_ISE:
if (handle_slb_spill(kernel_pmap,
(type == EXC_ISE) ? frame->srr0 :
frame->cpu.aim.dar) != 0)
panic("Fault handling kernel SLB miss");
return;
break;
#endif
case EXC_DSI:
if (trap_pfault(frame, 0) == 0)
return;
break;
case EXC_MCHK:
if (handle_onfault(frame))
return;
@ -326,8 +321,7 @@ printtrap(u_int vector, struct trapframe *frame, int isfatal, int user)
printf("%s %s trap:\n", isfatal ? "fatal" : "handled",
user ? "user" : "kernel");
printf("\n");
printf(" exception = 0x%x (%s)\n", vector >> 8,
trapname(vector));
printf(" exception = 0x%x (%s)\n", vector, trapname(vector));
switch (vector) {
case EXC_DSE:
case EXC_DSI:
@ -486,8 +480,54 @@ syscall(struct trapframe *frame)
}
#ifdef __powerpc64__
/*
 * Kernel SLB miss handler, called from the kern_slbtrap assembly stub.
 * Runs in real mode with no safety nets.
 *
 * type selects the faulting address: srr0 for EXC_ISE, dar otherwise.
 * Only this CPU's SLB cache is updated here; the calling trap code
 * reloads the hardware SLB from that cache on the way out.
 */
void
handle_kernel_slb_spill(int type, register_t dar, register_t srr0)
{
	struct slb *cache;
	uint64_t fault_addr, seg_id;
	uint64_t new_slbe, new_slbv;
	int slot, victim;

	if (type == EXC_ISE)
		fault_addr = srr0;
	else
		fault_addr = dar;

	cache = PCPU_GET(slb);
	seg_id = (uintptr_t)fault_addr >> ADDR_SR_SHFT;
	new_slbe = (seg_id << SLBE_ESID_SHIFT) | SLBE_VALID;

	/*
	 * If the cache already holds this mapping, the hardware copy was
	 * dropped behind our back (can happen in LPARs); nothing to add.
	 */
	for (slot = 0; slot < n_slbs; slot++) {
		if (cache[slot].slbe == (new_slbe | (uint64_t)slot))
			return;
	}

	/* Genuinely missing: work out the VSID half of the new entry. */
	new_slbv = kernel_va_to_slbv(fault_addr);

	/*
	 * While the user slot's slbe is still 0, look for an invalid
	 * entry first.  Once none is left, set the user slot's slbe to 1
	 * so later faults skip this scan.
	 */
	victim = -1;
	if (cache[USER_SLB_SLOT].slbe == 0) {
		for (slot = 0; slot < n_slbs; slot++) {
			if (slot != USER_SLB_SLOT &&
			    (cache[slot].slbe & SLBE_VALID) == 0) {
				victim = slot;
				break;
			}
		}
		if (victim < 0)
			cache[USER_SLB_SLOT].slbe = 1;
	}

	/* No free slot: evict a timebase-chosen entry, never the user one. */
	if (victim < 0) {
		victim = mftb() % n_slbs;
		if (victim == USER_SLB_SLOT)
			victim = (victim + 1) % n_slbs;
	}

	/* Record the new entry in the cache; the slot index is part of slbe. */
	cache[victim].slbv = new_slbv;
	cache[victim].slbe = new_slbe | (uint64_t)victim;
}
static int
handle_slb_spill(pmap_t pm, vm_offset_t addr)
handle_user_slb_spill(pmap_t pm, vm_offset_t addr)
{
struct slb *user_entry;
uint64_t esid;
@ -495,12 +535,6 @@ handle_slb_spill(pmap_t pm, vm_offset_t addr)
esid = (uintptr_t)addr >> ADDR_SR_SHFT;
if (pm == kernel_pmap) {
slb_insert_kernel((esid << SLBE_ESID_SHIFT) | SLBE_VALID,
kernel_va_to_slbv(addr));
return (0);
}
PMAP_LOCK(pm);
user_entry = user_va_to_slb_entry(pm, addr);

View file

@ -112,6 +112,9 @@ restore_kernsrs:
* r31 scratch
* r1 kernel stack
* SRR0/1 as at start of trap
*
* NOTE: SPRG1 is never used while the MMU is on, making it safe to reuse
* in any real-mode fault handler, including those handling double faults.
*/
#define FRAME_SETUP(savearea) \
/* Have to enable translation to allow access of kernel stack: */ \
@ -120,11 +123,11 @@ restore_kernsrs:
std %r30,(savearea+CPUSAVE_SRR0)(%r31); /* save SRR0 */ \
mfsrr1 %r30; \
std %r30,(savearea+CPUSAVE_SRR1)(%r31); /* save SRR1 */ \
mfsprg1 %r31; /* get saved SP (clears SPRG1) */ \
mfmsr %r30; \
ori %r30,%r30,(PSL_DR|PSL_IR|PSL_RI)@l; /* relocation on */ \
mtmsr %r30; /* stack can now be accessed */ \
isync; \
mfsprg1 %r31; /* get saved SP */ \
stdu %r31,-(FRAMELEN+288)(%r1); /* save it in the callframe */ \
std %r0, FRAME_0+48(%r1); /* save r0 in the trapframe */ \
std %r31,FRAME_1+48(%r1); /* save SP " " */ \
@ -201,7 +204,7 @@ restore_kernsrs:
mtctr %r4; \
mtxer %r5; \
mtlr %r6; \
mtsprg1 %r7; /* save cr */ \
mtsprg2 %r7; /* save cr */ \
ld %r31,FRAME_31+48(%r1); /* restore r0-31 */ \
ld %r30,FRAME_30+48(%r1); \
ld %r29,FRAME_29+48(%r1); \
@ -235,16 +238,15 @@ restore_kernsrs:
ld %r0, FRAME_0+48(%r1); \
ld %r1, FRAME_1+48(%r1); \
/* Can't touch %r1 from here on */ \
mtsprg2 %r2; /* save r2 & r3 */ \
mtsprg3 %r3; \
mtsprg3 %r3; /* save r3 */ \
/* Disable translation, machine check and recoverability: */ \
mfmsr %r2; \
andi. %r2,%r2,~(PSL_DR|PSL_IR|PSL_ME|PSL_RI)@l; \
mtmsr %r2; \
mfmsr %r3; \
andi. %r3,%r3,~(PSL_DR|PSL_IR|PSL_ME|PSL_RI)@l; \
mtmsr %r3; \
isync; \
/* Decide whether we return to user mode: */ \
GET_CPUINFO(%r2); \
ld %r3,(savearea+CPUSAVE_SRR1)(%r2); \
GET_CPUINFO(%r3); \
ld %r3,(savearea+CPUSAVE_SRR1)(%r3); \
mtcr %r3; \
bf 17,1f; /* branch if PSL_PR is false */ \
/* Restore user SRs */ \
@ -262,15 +264,15 @@ restore_kernsrs:
ld %r29,(savearea+CPUSAVE_R29)(%r3); \
ld %r28,(savearea+CPUSAVE_R28)(%r3); \
ld %r27,(savearea+CPUSAVE_R27)(%r3); \
1: mfsprg1 %r2; /* restore cr */ \
mtcr %r2; \
GET_CPUINFO(%r2); \
ld %r3,(savearea+CPUSAVE_SRR0)(%r2); /* restore srr0 */ \
1: mfsprg2 %r3; /* restore cr */ \
mtcr %r3; \
GET_CPUINFO(%r3); \
ld %r3,(savearea+CPUSAVE_SRR0)(%r3); /* restore srr0 */ \
mtsrr0 %r3; \
ld %r3,(savearea+CPUSAVE_SRR1)(%r2); /* restore srr1 */ \
GET_CPUINFO(%r3); \
ld %r3,(savearea+CPUSAVE_SRR1)(%r3); /* restore srr1 */ \
mtsrr1 %r3; \
mfsprg2 %r2; /* restore r2 & r3 */ \
mfsprg3 %r3
mfsprg3 %r3 /* restore r3 */
#ifdef SMP
/*
@ -329,6 +331,151 @@ CNAME(trapcode):
bla generictrap /* LR & SPRG3 is exception # */
CNAME(trapsize) = .-CNAME(trapcode)
/*
* For SLB misses: do special things for the kernel
*
* This stub is the code between slbtrap and slbtrapsize. It saves r2 and CR
* in the per-CPU PC_SLBSAVE area, then tests the PR bit of SRR1: faults
* taken in user mode are handed to generictrap with r2 and CR restored,
* faults taken in kernel mode go to kern_slbtrap.
*
* Note: SPRG1 is always safe to overwrite any time the MMU is on, which is
* the only time this can be called.
*/
.globl CNAME(slbtrap),CNAME(slbtrapsize)
CNAME(slbtrap):
mtsprg1 %r1 /* save SP */
GET_CPUINFO(%r1) /* r1 = per-CPU data pointer from here on */
std %r2,(PC_SLBSAVE+16)(%r1) /* save R2 so it can be used as scratch */
mfcr %r2 /* save CR */
std %r2,(PC_SLBSAVE+104)(%r1)
mfsrr1 %r2 /* test kernel mode */
mtcr %r2 /* copy SRR1 bits into CR so PSL_PR can be tested */
bf 17,1f /* branch if PSL_PR is false */
/* User mode */
ld %r2,(PC_SLBSAVE+104)(%r1) /* Restore CR */
mtcr %r2
ld %r2,(PC_SLBSAVE+16)(%r1) /* Restore R2 */
mflr %r1 /* Save the old LR in r1 */
mtsprg2 %r1 /* And then in SPRG2 */
li %r1, 0x80 /* How to get the vector from LR */
bla generictrap /* LR & SPRG3 is exception # */
/* Kernel mode: enter kern_slbtrap with r1 = per-CPU pointer, r2 = old LR */
1: mflr %r2 /* Save the old LR in r2 */
bla kern_slbtrap /* bla sets LR to an address inside this stub */
CNAME(slbtrapsize) = .-CNAME(slbtrap)
/*
* Kernel-mode SLB miss handler, entered from slbtrap via bla.
*
* On entry: r1 = per-CPU data pointer, r2 = LR at the time of the fault,
* LR = return address inside the slbtrap stub (used to identify the
* vector), SPRG1 = original r1. The original r2 and CR are already in the
* PC_SLBSAVE area.
*
* PC_SLBSAVE slot usage in this routine (byte offsets):
* 0 r0, 8 original r1, 16 r2, 24 r3, 32-96 r4-r12, 104 CR, 112 XER,
* 120 LR as set by the bla, 128 CTR, 136 LR at the time of the fault.
* Slots 64-88 are reused for r28-r31 once r8-r11 have been reloaded.
*
* A miss on the USER_ADDR segment is forwarded to generictrap; any other
* miss is resolved by handle_kernel_slb_spill() followed by
* restore_kernsrs, and the routine returns with rfid.
*/
kern_slbtrap:
std %r2,(PC_SLBSAVE+136)(%r1) /* old LR */
std %r3,(PC_SLBSAVE+24)(%r1) /* save R3 */
/* Check if this needs to be handled as a regular trap (userseg miss) */
mflr %r2
andi. %r2,%r2,0xff80 /* mask LR down to the vector address */
cmpwi %r2,0x380 /* vector 0x380 takes the address from DAR (presumably EXC_DSE) */
bne 1f
mfdar %r2
b 2f
1: mfsrr0 %r2 /* any other vector: faulting address is SRR0 */
2: /* r2 now contains the fault address */
/*
* Build the 64-bit SEGMENT_MASK constant in r3, 16 bits at a time.
* NOTE(review): the adjusted (@highesta/@highera/@ha) operators are paired
* with ori/oris here -- confirm the constants are unaffected by the
* adjustment.
*/
lis %r3,SEGMENT_MASK@highesta
ori %r3,%r3,SEGMENT_MASK@highera
sldi %r3,%r3,32
oris %r3,%r3,SEGMENT_MASK@ha
ori %r3,%r3,SEGMENT_MASK@l
and %r2,%r2,%r3 /* R2 = segment base address */
/* Build the 64-bit USER_ADDR constant in r3 the same way */
lis %r3,USER_ADDR@highesta
ori %r3,%r3,USER_ADDR@highera
sldi %r3,%r3,32
oris %r3,%r3,USER_ADDR@ha
ori %r3,%r3,USER_ADDR@l
cmpd %r2,%r3 /* Compare fault base to USER_ADDR */
bne 3f
/* User seg miss, handle as a regular trap */
ld %r2,(PC_SLBSAVE+104)(%r1) /* Restore CR */
mtcr %r2
ld %r2,(PC_SLBSAVE+16)(%r1) /* Restore R2,R3 */
ld %r3,(PC_SLBSAVE+24)(%r1)
ld %r1,(PC_SLBSAVE+136)(%r1) /* Fetch the old LR into r1 */
mtsprg2 %r1 /* And then in SPRG2 */
li %r1, 0x80 /* How to get the vector from LR */
b generictrap /* Plain branch: LR (set by the bla in slbtrap) stays intact */
3: /* Real kernel SLB miss */
std %r0,(PC_SLBSAVE+0)(%r1) /* free all volatile regs */
mfsprg1 %r2 /* Old R1 */
std %r2,(PC_SLBSAVE+8)(%r1)
/* R2,R3 already saved */
std %r4,(PC_SLBSAVE+32)(%r1)
std %r5,(PC_SLBSAVE+40)(%r1)
std %r6,(PC_SLBSAVE+48)(%r1)
std %r7,(PC_SLBSAVE+56)(%r1)
std %r8,(PC_SLBSAVE+64)(%r1)
std %r9,(PC_SLBSAVE+72)(%r1)
std %r10,(PC_SLBSAVE+80)(%r1)
std %r11,(PC_SLBSAVE+88)(%r1)
std %r12,(PC_SLBSAVE+96)(%r1)
/* CR already saved */
mfxer %r2 /* save XER */
std %r2,(PC_SLBSAVE+112)(%r1)
mflr %r2 /* save LR (SP already saved) */
std %r2,(PC_SLBSAVE+120)(%r1)
mfctr %r2 /* save CTR */
std %r2,(PC_SLBSAVE+128)(%r1)
/* Call handler */
/* Point r1 48 bytes below PC_SLBSTACK+1024, then round down to 16 bytes */
addi %r1,%r1,PC_SLBSTACK-48+1024
li %r2,~15
and %r1,%r1,%r2
/* Load r2 from tocbase (presumably the kernel TOC pointer for C code) */
lis %r3,tocbase@ha
ld %r2,tocbase@l(%r3)
/* Arguments: r3 = vector (LR & 0xff80), r4 = DAR, r5 = SRR0 */
mflr %r3
andi. %r3,%r3,0xff80
mfdar %r4
mfsrr0 %r5
bl handle_kernel_slb_spill
nop
/* Save r28-31, restore r4-r12 */
GET_CPUINFO(%r1) /* r1 held the temporary stack; reload the per-CPU pointer */
ld %r4,(PC_SLBSAVE+32)(%r1)
ld %r5,(PC_SLBSAVE+40)(%r1)
ld %r6,(PC_SLBSAVE+48)(%r1)
ld %r7,(PC_SLBSAVE+56)(%r1)
ld %r8,(PC_SLBSAVE+64)(%r1)
ld %r9,(PC_SLBSAVE+72)(%r1)
ld %r10,(PC_SLBSAVE+80)(%r1)
ld %r11,(PC_SLBSAVE+88)(%r1)
ld %r12,(PC_SLBSAVE+96)(%r1)
/* Slots 64-88 are free again now that r8-r11 are reloaded; park r28-r31 there */
/* (presumably because restore_kernsrs uses r28-r31 as scratch -- confirm) */
std %r28,(PC_SLBSAVE+64)(%r1)
std %r29,(PC_SLBSAVE+72)(%r1)
std %r30,(PC_SLBSAVE+80)(%r1)
std %r31,(PC_SLBSAVE+88)(%r1)
/* Restore kernel mapping */
bl restore_kernsrs
/* Restore remaining registers */
ld %r28,(PC_SLBSAVE+64)(%r1)
ld %r29,(PC_SLBSAVE+72)(%r1)
ld %r30,(PC_SLBSAVE+80)(%r1)
ld %r31,(PC_SLBSAVE+88)(%r1)
ld %r2,(PC_SLBSAVE+104)(%r1) /* CR */
mtcr %r2
ld %r2,(PC_SLBSAVE+112)(%r1) /* XER */
mtxer %r2
/* NOTE(review): this LR value is overwritten by the second mtlr below */
ld %r2,(PC_SLBSAVE+120)(%r1)
mtlr %r2
ld %r2,(PC_SLBSAVE+128)(%r1) /* CTR */
mtctr %r2
ld %r2,(PC_SLBSAVE+136)(%r1) /* LR as it was when the fault was taken */
mtlr %r2
/* Restore r0-r3 */
ld %r0,(PC_SLBSAVE+0)(%r1)
ld %r2,(PC_SLBSAVE+16)(%r1)
ld %r3,(PC_SLBSAVE+24)(%r1)
mfsprg1 %r1 /* original r1, stashed in SPRG1 by slbtrap */
/* Back to whatever we were doing */
rfid
/*
* For ALI: has to save DSISR and DAR
*/

View file

@ -55,7 +55,9 @@ struct pmap;
#define PCPU_MD_AIM64_FIELDS \
struct slb pc_slb[64]; \
struct slb **pc_userslb;
struct slb **pc_userslb; \
register_t pc_slbsave[18]; \
uint8_t pc_slbstack[1024];
#ifdef __powerpc64__
#define PCPU_MD_AIM_FIELDS PCPU_MD_AIM64_FIELDS

View file

@ -107,8 +107,11 @@ ASSYM(USER_ADDR, USER_ADDR);
#ifdef __powerpc64__
ASSYM(PC_KERNSLB, offsetof(struct pcpu, pc_slb));
ASSYM(PC_USERSLB, offsetof(struct pcpu, pc_userslb));
ASSYM(PC_SLBSAVE, offsetof(struct pcpu, pc_slbsave));
ASSYM(PC_SLBSTACK, offsetof(struct pcpu, pc_slbstack));
ASSYM(USER_SLB_SLOT, USER_SLB_SLOT);
ASSYM(USER_SLB_SLBE, USER_SLB_SLBE);
ASSYM(SEGMENT_MASK, SEGMENT_MASK);
#else
ASSYM(PM_SR, offsetof(struct pmap, pm_sr));
ASSYM(USER_SR, USER_SR);