opnsense-src/sys/compat/ia32/ia32_sysvec.c

236 lines
7 KiB
C
Raw Normal View History

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2002 Doug Rabson
* Copyright (c) 2003 Peter Wemm
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#define __ELF_WORD_SIZE 32
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/procfs.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/imgact_elf.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32_proto.h>
2003-12-10 18:16:32 -05:00
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/ia32/ia32_signal.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/cpufunc.h>
CTASSERT(sizeof(struct ia32_mcontext) == 640);
CTASSERT(sizeof(struct ia32_ucontext) == 704);
CTASSERT(sizeof(struct ia32_sigframe) == 800);
CTASSERT(sizeof(struct siginfo32) == 64);
#ifdef COMPAT_FREEBSD4
CTASSERT(sizeof(struct ia32_mcontext4) == 260);
CTASSERT(sizeof(struct ia32_ucontext4) == 324);
CTASSERT(sizeof(struct ia32_sigframe4) == 408);
#endif
Reorganize syscall entry and leave handling. Extend struct sysvec with three new elements: sv_fetch_syscall_args - the method to fetch syscall arguments from usermode into struct syscall_args. The structure is machine-depended (this might be reconsidered after all architectures are converted). sv_set_syscall_retval - the method to set a return value for usermode from the syscall. It is a generalization of cpu_set_syscall_retval(9) to allow ABIs to override the way to set a return value. sv_syscallnames - the table of syscall names. Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding the call to cpu_set_syscall_retval(). The new functions syscallenter(9) and syscallret(9) are provided that use sv_*syscall* pointers and contain the common repeated code from the syscall() implementations for the architecture-specific syscall trap handlers. Syscallenter() fetches arguments, calls syscall implementation from ABI sysent table, and set up return frame. The end of syscall bookkeeping is done by syscallret(). Take advantage of single place for MI syscall handling code and implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the thread is stopped at syscall entry or return point respectively. The EXEC flag augments SCX and notifies debugger that the process address space was changed by one of exec(2)-family syscalls. The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are changed to use syscallenter()/syscallret(). MIPS and arm are not converted and use the mostly unchanged syscall() implementation. Reviewed by: jhb, marcel, marius, nwhitehorn, stas Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc), stas (mips) MFC after: 1 month
2010-05-23 14:32:02 -04:00
extern const char *freebsd32_syscallnames[];
static SYSCTL_NODE(_compat, OID_AUTO, ia32, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"ia32 mode");
static u_long ia32_maxdsiz = IA32_MAXDSIZ;
SYSCTL_ULONG(_compat_ia32, OID_AUTO, maxdsiz, CTLFLAG_RWTUN, &ia32_maxdsiz, 0, "");
u_long ia32_maxssiz = IA32_MAXSSIZ;
SYSCTL_ULONG(_compat_ia32, OID_AUTO, maxssiz, CTLFLAG_RWTUN, &ia32_maxssiz, 0, "");
static u_long ia32_maxvmem = IA32_MAXVMEM;
SYSCTL_ULONG(_compat_ia32, OID_AUTO, maxvmem, CTLFLAG_RWTUN, &ia32_maxvmem, 0, "");
struct sysentvec ia32_freebsd_sysvec = {
.sv_size = FREEBSD32_SYS_MAXSYSCALL,
.sv_table = freebsd32_sysent,
.sv_errsize = 0,
.sv_errtbl = NULL,
.sv_transtrap = NULL,
.sv_fixup = elf32_freebsd_fixup,
.sv_sendsig = ia32_sendsig,
.sv_sigcode = ia32_sigcode,
.sv_szsigcode = &sz_ia32_sigcode,
.sv_name = "FreeBSD ELF32",
.sv_coredump = elf32_coredump,
.sv_imgact_try = NULL,
.sv_minsigstksz = MINSIGSTKSZ,
.sv_minuser = FREEBSD32_MINUSER,
.sv_maxuser = FREEBSD32_MAXUSER,
.sv_usrstack = FREEBSD32_USRSTACK,
.sv_psstrings = FREEBSD32_PS_STRINGS,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_auxargs = elf32_freebsd_copyout_auxargs,
.sv_copyout_strings = freebsd32_copyout_strings,
.sv_setregs = ia32_setregs,
.sv_fixlimit = ia32_fixlimit,
.sv_maxssiz = &ia32_maxssiz,
Implement Address Space Layout Randomization (ASLR) With this change, randomization can be enabled for all non-fixed mappings. It means that the base address for the mapping is selected with a guaranteed amount of entropy (bits). If the mapping was requested to be superpage aligned, the randomization honours the superpage attributes. Although the value of ASLR is diminshing over time as exploit authors work out simple ASLR bypass techniques, it elimintates the trivial exploitation of certain vulnerabilities, at least in theory. This implementation is relatively small and happens at the correct architectural level. Also, it is not expected to introduce regressions in existing cases when turned off (default for now), or cause any significant maintaince burden. The randomization is done on a best-effort basis - that is, the allocator falls back to a first fit strategy if fragmentation prevents entropy injection. It is trivial to implement a strong mode where failure to guarantee the requested amount of entropy results in mapping request failure, but I do not consider that to be usable. I have not fine-tuned the amount of entropy injected right now. It is only a quantitive change that will not change the implementation. The current amount is controlled by aslr_pages_rnd. To not spoil coalescing optimizations, to reduce the page table fragmentation inherent to ASLR, and to keep the transient superpage promotion for the malloced memory, locality clustering is implemented for anonymous private mappings, which are automatically grouped until fragmentation kicks in. The initial location for the anon group range is, of course, randomized. This is controlled by vm.cluster_anon, enabled by default. The default mode keeps the sbrk area unpopulated by other mappings, but this can be turned off, which gives much more breathing bits on architectures with small address space, such as i386. This is tied with the question of following an application's hint about the mmap(2) base address. Testing shows that ignoring the hint does not affect the function of common applications, but I would expect more demanding code could break. By default sbrk is preserved and mmap hints are satisfied, which can be changed by using the kern.elf{32,64}.aslr.honor_sbrk sysctl. ASLR is enabled on per-ABI basis, and currently it is only allowed on FreeBSD native i386 and amd64 (including compat 32bit) ABIs. Support for additional architectures will be added after further testing. Both per-process and per-image controls are implemented: - procctl(2) adds PROC_ASLR_CTL/PROC_ASLR_STATUS; - NT_FREEBSD_FCTL_ASLR_DISABLE feature control note bit makes it possible to force ASLR off for the given binary. (A tool to edit the feature control note is in development.) Global controls are: - kern.elf{32,64}.aslr.enable - for non-fixed mappings done by mmap(2); - kern.elf{32,64}.aslr.pie_enable - for PIE image activation mappings; - kern.elf{32,64}.aslr.honor_sbrk - allow to use sbrk area for mmap(2); - vm.cluster_anon - enables anon mapping clustering. PR: 208580 (exp runs) Exp-runs done by: antoine Reviewed by: markj (previous version) Discussed with: emaste Tested by: pho MFC after: 1 month Sponsored by: The FreeBSD Foundation Differential revision: https://reviews.freebsd.org/D5603
2019-02-10 12:19:45 -05:00
.sv_flags = SV_ABI_FREEBSD | SV_ASLR | SV_IA32 | SV_ILP32 |
SV_SHP | SV_TIMEKEEP,
Reorganize syscall entry and leave handling. Extend struct sysvec with three new elements: sv_fetch_syscall_args - the method to fetch syscall arguments from usermode into struct syscall_args. The structure is machine-depended (this might be reconsidered after all architectures are converted). sv_set_syscall_retval - the method to set a return value for usermode from the syscall. It is a generalization of cpu_set_syscall_retval(9) to allow ABIs to override the way to set a return value. sv_syscallnames - the table of syscall names. Use sv_set_syscall_retval in kern_sigsuspend() instead of hardcoding the call to cpu_set_syscall_retval(). The new functions syscallenter(9) and syscallret(9) are provided that use sv_*syscall* pointers and contain the common repeated code from the syscall() implementations for the architecture-specific syscall trap handlers. Syscallenter() fetches arguments, calls syscall implementation from ABI sysent table, and set up return frame. The end of syscall bookkeeping is done by syscallret(). Take advantage of single place for MI syscall handling code and implement ptrace_lwpinfo pl_flags PL_FLAG_SCE, PL_FLAG_SCX and PL_FLAG_EXEC. The SCE and SCX flags notify the debugger that the thread is stopped at syscall entry or return point respectively. The EXEC flag augments SCX and notifies debugger that the process address space was changed by one of exec(2)-family syscalls. The i386, amd64, sparc64, sun4v, powerpc and ia64 syscall()s are changed to use syscallenter()/syscallret(). MIPS and arm are not converted and use the mostly unchanged syscall() implementation. Reviewed by: jhb, marcel, marius, nwhitehorn, stas Tested by: marcel (ia64), marius (sparc64), nwhitehorn (powerpc), stas (mips) MFC after: 1 month
2010-05-23 14:32:02 -04:00
.sv_set_syscall_retval = ia32_set_syscall_retval,
.sv_fetch_syscall_args = ia32_fetch_syscall_args,
.sv_syscallnames = freebsd32_syscallnames,
.sv_shared_page_base = FREEBSD32_SHAREDPAGE,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = NULL,
.sv_thread_detach = NULL,
.sv_trap = NULL,
.sv_stackgap = elf32_stackgap,
};
INIT_SYSENTVEC(elf_ia32_sysvec, &ia32_freebsd_sysvec);
static Elf32_Brandinfo ia32_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_386,
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/libexec/ld-elf.so.1",
.sysvec = &ia32_freebsd_sysvec,
.interp_newpath = "/libexec/ld-elf32.so.1",
.brand_note = &elf32_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_MIDDLE,
(sysinit_cfunc_t) elf32_insert_brand_entry,
&ia32_brand_info);
static Elf32_Brandinfo ia32_brand_oinfo = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_386,
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/usr/libexec/ld-elf.so.1",
.sysvec = &ia32_freebsd_sysvec,
.interp_newpath = "/libexec/ld-elf32.so.1",
.brand_note = &elf32_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY,
(sysinit_cfunc_t) elf32_insert_brand_entry,
&ia32_brand_oinfo);
static Elf32_Brandinfo kia32_brand_info = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_386,
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/lib/ld.so.1",
.sysvec = &ia32_freebsd_sysvec,
.brand_note = &elf32_kfreebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
};
SYSINIT(kia32, SI_SUB_EXEC, SI_ORDER_ANY,
(sysinit_cfunc_t) elf32_insert_brand_entry,
&kia32_brand_info);
void
elf32_dump_thread(struct thread *td, void *dst, size_t *off)
{
void *buf;
size_t len;
len = 0;
if (use_xsave) {
if (dst != NULL) {
fpugetregs(td);
len += elf32_populate_note(NT_X86_XSTATE,
get_pcb_user_save_td(td), dst,
cpu_max_ext_state_size, &buf);
*(uint64_t *)((char *)buf + X86_XSTATE_XCR0_OFFSET) =
xsave_mask;
} else
len += elf32_populate_note(NT_X86_XSTATE, NULL, NULL,
cpu_max_ext_state_size, NULL);
}
*off = len;
}
void
ia32_fixlimit(struct rlimit *rl, int which)
{
switch (which) {
case RLIMIT_DATA:
if (ia32_maxdsiz != 0) {
if (rl->rlim_cur > ia32_maxdsiz)
rl->rlim_cur = ia32_maxdsiz;
if (rl->rlim_max > ia32_maxdsiz)
rl->rlim_max = ia32_maxdsiz;
}
break;
case RLIMIT_STACK:
if (ia32_maxssiz != 0) {
if (rl->rlim_cur > ia32_maxssiz)
rl->rlim_cur = ia32_maxssiz;
if (rl->rlim_max > ia32_maxssiz)
rl->rlim_max = ia32_maxssiz;
}
break;
case RLIMIT_VMEM:
if (ia32_maxvmem != 0) {
if (rl->rlim_cur > ia32_maxvmem)
rl->rlim_cur = ia32_maxvmem;
if (rl->rlim_max > ia32_maxvmem)
rl->rlim_max = ia32_maxvmem;
}
break;
}
}