Mirror of https://github.com/opnsense/src.git
Implement the linux_io_* syscalls (AIO). They are only enabled if the native
AIO code is available (either compiled into the kernel or loaded as a module)
at the time the functions are used. If the native AIO support is not available,
the syscalls return ENOSYS.
From the submitter:
---snip---
DESIGN NOTES:
1. Linux permits a process to own multiple AIO queues (distinguished by
   "context"), but FreeBSD creates only a single AIO queue per process.
   My code maintains a request queue (an STAILQ from queue(3)) per "context"
   and throws all AIO requests of all contexts owned by a process into the
   single FreeBSD per-process AIO queue.
   When the process calls io_destroy(2), io_getevents(2), io_submit(2) or
   io_cancel(2), my code can pick out the requests owned by the specified
   context from the single FreeBSD per-process AIO queue according to the
   per-context request queues maintained by my code.
2. The request queue maintained by my code stores the correspondence between
   Linux I/O control blocks (struct linux_iocb) and FreeBSD I/O control blocks
   (struct aiocb). The FreeBSD I/O control blocks actually live in userland
   memory, as required by the FreeBSD native aio_XXXXXX(2) calls.
3. It is quite troubling that the io_getevents() function of libaio-0.3.105
   needs to use the Linux-specific "struct aio_ring", which is a partial
   mirror of the context in user space. I would rather take the address of
   the context in the kernel as the context ID, but the io_getevents() of
   libaio forces me to take the address of the "ring" in user space as the
   context ID.
   To my surprise, one comment line in the file "io_getevents.c" of
   libaio-0.3.105 reads:
       Ben will hate me for this
REFERENCE:
1. Linux kernel source code: http://www.kernel.org/pub/linux/kernel/v2.6/
(include/linux/aio_abi.h, fs/aio.c)
2. Linux manual pages: http://www.kernel.org/pub/linux/docs/manpages/
(io_setup(2), io_destroy(2), io_getevents(2), io_submit(2), io_cancel(2))
3. Linux Scalability Effort: http://lse.sourceforge.net/io/aio.html
The design notes: http://lse.sourceforge.net/io/aionotes.txt
4. The package libaio, both source and binary:
http://rpmfind.net/linux/rpm2html/search.php?query=libaio
Simple transparent interface to Linux AIO system calls.
5. Libaio-oracle: http://oss.oracle.com/projects/libaio-oracle/
POSIX AIO implementation based on Linux AIO system calls (depending on
libaio).
---snip---
Submitted by: Li, Xiao <intron@intron.ac>
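For readers who want to exercise the new syscalls, here is a minimal userland
sketch (not part of the commit; it assumes a Linux program built against
libaio-0.3.x, and the file name is only an example). It walks the
io_setup(2)/io_submit(2)/io_getevents(2)/io_destroy(2) sequence that the code
below serves; if the native "aio" module is not available, the first call is
expected to fail with ENOSYS, as described above.

/*
 * Hypothetical userland test, assuming libaio-0.3.x under the Linux ABI.
 * Build (on the Linux side) roughly as: cc -o aiotest aiotest.c -laio
 */
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	io_context_t ctx = 0;		/* must start zeroed, see io_setup(2) */
	struct iocb cb, *cbs[1];
	struct io_event ev;
	char buf[512];
	int fd, rc;

	rc = io_setup(8, &ctx);		/* one queue ("context") of 8 requests */
	if (rc < 0) {
		/* -ENOSYS when the FreeBSD "aio" module is unavailable */
		fprintf(stderr, "io_setup: %s\n", strerror(-rc));
		return (1);
	}

	fd = open("/etc/passwd", O_RDONLY);	/* example file only */
	if (fd < 0) {
		perror("open");
		return (1);
	}
	io_prep_pread(&cb, fd, buf, sizeof(buf), 0);	/* async read at offset 0 */
	cbs[0] = &cb;

	rc = io_submit(ctx, 1, cbs);		/* hand the request to the kernel */
	if (rc == 1)
		rc = io_getevents(ctx, 1, 1, &ev, NULL);	/* block for completion */
	if (rc == 1)
		printf("read %ld bytes\n", (long)ev.res);

	io_destroy(ctx);			/* frees the context and its "ring" */
	return (0);
}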
/*-
 * Copyright (c) 2006 Li, Xiao <intron@intron.ac>. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/eventhandler.h>
#include <sys/aio.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/queue.h>
#include <vm/uma.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/linker.h>
#include <sys/sysctl.h>
#include <sys/syscall.h>
#include <sys/sysproto.h>

#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif

#define LINUX_AIO_DEBUG

/*
 * Linux Kernel Implementation of Asynchronous I/O
 */

#ifdef LINUX_AIO_DEBUG

/* Print arguments of syscall */
#define DARGPRINTF(fmt, ...)	printf("linux(%ld): %s(" fmt ")\n", \
				    (long)td->td_proc->p_pid, __func__, __VA_ARGS__)

/* Print message in syscall function */
#define DPRINTF(fmt, ...)	printf(LMSG("%s(): " fmt), \
				    __func__, __VA_ARGS__)

/* Print message in non-syscall function, the one more "P" means "private" */
#define DPPRINTF(fmt, ...)	printf("linux(): %s(): " fmt "\n", \
				    __func__, __VA_ARGS__)

#else

#define DARGPRINTF(fmt, ...)
#define DPRINTF(fmt, ...)
#define DPPRINTF(fmt, ...)

#endif

/*
 * DATA STRUCTURE HIERARCHY
 *
 *                   +--------------------+      +--------------------+
 * context_list ---> |      context       | ---> |      context       | ---> ...
 *    SLIST          |(owned by a process)|      |(owned by a process)|
 *                   |                    |      |                    |
 *                   |      ctx_req       |      |      ctx_req       |
 *                   +----|---------------+      +----|---------------+
 *                        | STAILQ              |      | STAILQ
 *                        v                            v
 *                   +------------+              +------------+
 *                   |  request   |              |  request   |
 *                   |            |              |            |
 *                   | .req_pbsd  |              | .req_pbsd  |
 *                   | .req_porig |              | .req_porig |
 *                   | .req_linux |              | .req_linux |
 *                   |            |              |            |
 *                   +------------+              +------------+
 *                        |                            |
 *                        v                            v
 *                   +------------+              +------------+
 *                   |  request   |              |  request   |
 *                   |            |              |            |
 *                   | .req_pbsd  |              | .req_pbsd  |
 *                   | .req_porig |              | .req_porig |
 *                   | .req_linux |              | .req_linux |
 *                   |            |              |            |
 *                   +------------+              +------------+
 *                        |                            |
 *                        v                            v
 *                       ...                          ...
 */

struct linux_aio_context;

struct linux_aio_request {
	struct aiocb *req_pbsd;		/* Userland clone for FreeBSD */
	struct linux_iocb *req_porig;	/* Userland original control block */
	struct linux_iocb req_linux;	/* Copy of original control block */
	STAILQ_ENTRY(linux_aio_request) req_ctx_entry;
};

struct linux_aio_context {
	struct sx ctx_sx;
	pid_t ctx_pid;
	struct linux_aio_ring *ctx_pring;
	int ctx_nreq_max;		/* Maximum request number */
	int ctx_nreq_cur;		/* Current request number */
	STAILQ_HEAD(,linux_aio_request) ctx_req;
	SLIST_ENTRY(linux_aio_context) ctx_list_entry;
};
static SLIST_HEAD(,linux_aio_context) linux_aio_context_list;

#define LINUX_AIO_REQ_HOOK(pctx, preq) { \
	STAILQ_INSERT_TAIL(&((pctx)->ctx_req), (preq), req_ctx_entry); \
	(pctx)->ctx_nreq_cur++; \
}

#define LINUX_AIO_REQ_UNHOOK(pctx, preq) { \
	STAILQ_REMOVE(&((pctx)->ctx_req), (preq), linux_aio_request, \
	    req_ctx_entry); \
	(pctx)->ctx_nreq_cur--; \
}

#define LINUX_AIO_REQ_FOREACH(pctx, preq) \
	STAILQ_FOREACH((preq), &((pctx)->ctx_req), req_ctx_entry)

#define LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) \
	STAILQ_FOREACH_SAFE((preq), &((pctx)->ctx_req), req_ctx_entry, \
	    (ptmpreq))

#define LINUX_AIO_CTX_LOCK(pctx)	sx_xlock(&((pctx)->ctx_sx))

#define LINUX_AIO_CTX_UNLOCK(pctx)	sx_unlock(&((pctx)->ctx_sx))

#define LINUX_AIO_CTX_HOOK(pctx) \
	SLIST_INSERT_HEAD(&linux_aio_context_list, (pctx), ctx_list_entry)

#define LINUX_AIO_CTX_UNHOOK(pctx) \
	SLIST_REMOVE(&linux_aio_context_list, (pctx), \
	    linux_aio_context, ctx_list_entry)

#define LINUX_AIO_CTX_FOREACH(pctx) \
	SLIST_FOREACH((pctx), &linux_aio_context_list, ctx_list_entry)

#define LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx) \
	SLIST_FOREACH_SAFE((pctx), &linux_aio_context_list, \
	    ctx_list_entry, (ptmpctx))

#define LINUX_AIO_CTX_MATCH(pctx, ctxid, pid) \
	((linux_aio_context_t)(pctx)->ctx_pring == (ctxid) \
	    && (pctx)->ctx_pid == (pid))

static struct mtx linux_aio_context_list_mtx;

#define LINUX_AIO_CTX_LIST_LOCK()	mtx_lock(&linux_aio_context_list_mtx)

#define LINUX_AIO_CTX_LIST_UNLOCK()	mtx_unlock(&linux_aio_context_list_mtx)

/*
 * The following two macros are substantially identical to the two macros
 * AIO_(UN)LOCK in /sys/kern/vfs_aio.c. Thus, the mutex must be unlocked
 * before calling functions of the FreeBSD native AIO module.
 *
 * XXX
 * I ASSUME the member "kaio_mtx" is the first element of "struct kaioinfo".
 */
#define LINUX_AIO_LOCK(p) { \
	if ((p)->p_aioinfo == NULL) \
		p_aio_init_aioinfo(p); \
	mtx_lock((struct mtx *)((p)->p_aioinfo)); \
}

#define LINUX_AIO_UNLOCK(p) { \
	if ((p)->p_aioinfo == NULL) \
		p_aio_init_aioinfo(p); \
	mtx_unlock((struct mtx *)((p)->p_aioinfo)); \
}

static uma_zone_t linux_aio_context_zone, linux_aio_request_zone;

static eventhandler_tag linux_aio_exit_tag;

/*
 * XXX
 * Calling external function/variable declared with "static" is DANGEROUS !!!
 * Compiler may use register to transfer calling arguments for optimization,
 * which is NOT a normal calling way and can cause kernel crash.
 */

#define NATIVE_AIO_MODULE_NAME	"aio"
static struct mod_depend native_aio_module_depend = {1, 1, 1};
static linker_file_t native_aio_module_handle = NULL;

/* Mirror of sysctls in /sys/kern/vfs_aio.c */
#define NATIVE_AIO_SYSCTL_CAPACITY_PROC	"vfs.aio.max_aio_queue_per_proc"
static int native_aio_capacity_proc;
#define NATIVE_AIO_SYSCTL_CAPACITY_SYS	"vfs.aio.max_aio_queue"
static int native_aio_capacity_sys;

/* For declaration of aio_aqueue(), defined in /sys/kern/vfs_aio.c */
struct aioliojob;

/* Functions in /sys/kern/vfs_aio.c, XXX defined with "static" */
#define GET_INTERNAL_FUNC_POINTER(s) { \
	*((caddr_t *)&p_ ## s) = linker_file_lookup_symbol( \
	    native_aio_module_handle, #s, FALSE); \
	if (p_ ## s == NULL) \
		break; \
}
static void (*p_aio_init_aioinfo)(struct proc *p);
static int (*p_aio_aqueue)(struct thread *td, struct aiocb *job,
    struct aioliojob *lio, int type, int osigev);

/* System calls in /sys/kern/vfs_aio.c */
#define DEFINE_SYSCALL_POINTER_VARIABLE(s) \
	static int (*p_ ## s)(struct thread *, struct s ## _args *)
#define GET_SYSCALL_POINTER(s) { \
	*((sy_call_t **)&p_ ## s) = sysent[SYS_ ## s].sy_call; \
	if ((sy_call_t *)p_ ## s == (sy_call_t *)lkmressys) \
		break; \
}
DEFINE_SYSCALL_POINTER_VARIABLE(aio_return);
DEFINE_SYSCALL_POINTER_VARIABLE(aio_suspend);
DEFINE_SYSCALL_POINTER_VARIABLE(aio_cancel);
DEFINE_SYSCALL_POINTER_VARIABLE(aio_error);

static int user_mem_rw_verify(void *p, size_t s)
{
	char buf[256];
	size_t i;
	int nerr = 0;

	for (i = 0; i < s; i += sizeof(buf)) {
		/* Verify reading */
		nerr = copyin((char *)p + i, buf, MIN(sizeof(buf), s - i));
		if (nerr != 0)
			break;

		/* Verify writing */
		nerr = copyout(buf, (char *)p + i, MIN(sizeof(buf), s - i));
		if (nerr != 0)
			break;
	}

	return (nerr);
}

/* Allocate memory in user space */
static int user_malloc(struct thread *td, void **pp, size_t s)
{
	struct mmap_args mmaparg;
	int nerr;
	register_t r;

	r = td->td_retval[0];

	mmaparg.addr = NULL;
	mmaparg.len = s;
	mmaparg.prot = PROT_READ | PROT_WRITE;
	mmaparg.flags = MAP_PRIVATE | MAP_ANON;
	mmaparg.fd = -1;
	mmaparg.pad = 0;
	mmaparg.pos = 0;

	nerr = mmap(td, &mmaparg);

	if (nerr == 0) {
		*pp = (void *)td->td_retval[0];
		DPPRINTF("%lu bytes allocated at %p", (unsigned long)s, *pp);
	}

	td->td_retval[0] = r;

	return (nerr);
}

/* Free memory in user space */
static int user_free(struct thread *td, void *p, size_t s)
{
	struct munmap_args munmaparg;
	int nerr;
	register_t r;

	r = td->td_retval[0];

	munmaparg.addr = p;
	munmaparg.len = s;

	nerr = munmap(td, &munmaparg);

	td->td_retval[0] = r;
	DPPRINTF("%lu bytes at %p", (unsigned long)s, p);

	return (nerr);
}

#ifdef LINUX_AIO_DEBUG

static void linux_aio_dump_freebsd_aiocb(struct aiocb *piocb, int isuserland)
{
	struct aiocb localcb, *pcb;
	int nerr = 0;

	if (isuserland) {
		nerr = copyin(piocb, &localcb, sizeof(localcb));
		pcb = &localcb;
	} else
		pcb = piocb;

	DPPRINTF("Dump struct aiocb (%p, %s): %s",
	    piocb, (isuserland ? "userland" : "kernel"),
	    (nerr ? "Failure" : ""));
	if (!nerr) {
		DPPRINTF("aio_fildes: %d",
		    pcb->aio_fildes);
		DPPRINTF("aio_offset: %lu",
		    (unsigned long)pcb->aio_offset);
		DPPRINTF("aio_buf: %p",
		    pcb->aio_buf);
		DPPRINTF("aio_nbytes: %lu",
		    (unsigned long)pcb->aio_nbytes);
		DPPRINTF("aio_lio_opcode: %d",
		    pcb->aio_lio_opcode);
		DPPRINTF("aio_reqprio: %d",
		    pcb->aio_reqprio);
		DPPRINTF("aio_sigevent.sigev_notify: %d",
		    pcb->aio_sigevent.sigev_notify);
		DPPRINTF("aio_sigevent.sigev_signo: %d",
		    pcb->aio_sigevent.sigev_signo);
	}
}

#define DUMP_FREEBSD_AIOCB(p, isu)	linux_aio_dump_freebsd_aiocb((p), (isu));

#define DUMP_TIMESPEC(f, t, a) \
	DPRINTF("%s%ld second + %ld nanosecond%s", \
	    (f), (long)(t)->tv_sec, (long)(t)->tv_nsec, (a));

#else /* ! LINUX_AIO_DEBUG */

#define DUMP_FREEBSD_AIOCB(p, isu)
#define DUMP_TIMESPEC(f, t, a)

#endif /* LINUX_AIO_DEBUG */

static int iocb_reformat(struct linux_iocb *plnx, struct aiocb *pbsd)
{
	int nerr = 0;

	bzero(pbsd, sizeof(*pbsd));

	pbsd->aio_fildes = plnx->aio_fildes;	/* File descriptor */
	pbsd->aio_offset = plnx->aio_offset;	/* File offset for I/O */
	pbsd->aio_buf = (void *)(unsigned long)plnx->aio_buf;
						/* User space I/O buffer */
	pbsd->aio_nbytes = plnx->aio_nbytes;	/* Number of bytes for I/O */
	switch (plnx->aio_lio_opcode) {		/* LIO opcode */
	case LINUX_IOCB_CMD_PREAD:
		pbsd->aio_lio_opcode = LIO_READ;
		break;
	case LINUX_IOCB_CMD_PWRITE:
		pbsd->aio_lio_opcode = LIO_WRITE;
		break;
	case LINUX_IOCB_CMD_FSYNC:
	case LINUX_IOCB_CMD_FDSYNC:
		pbsd->aio_lio_opcode = LIO_SYNC;
		break;
#if 0
	case LINUX_IOCB_CMD_PREADX:
		break;
	case LINUX_IOCB_CMD_POLL:
		break;
#endif
	case LINUX_IOCB_CMD_NOOP:
		pbsd->aio_lio_opcode = LIO_NOP;
		break;
	default:
		nerr = EINVAL;
		break;
	}
	if (nerr != 0) {
		DPPRINTF("Unsupported aio_lio_opcode: %u",
		    (unsigned)plnx->aio_lio_opcode);
		return (nerr);
	}
	pbsd->aio_reqprio = plnx->aio_reqprio;	/* Request priority */
	pbsd->aio_sigevent.sigev_notify = SIGEV_NONE;	/* No signal to deliver */
	pbsd->aio_sigevent.sigev_signo = 0;	/* No signal to deliver */

	return (nerr);
}

static int link_to_native_aio_module(struct thread *td)
{
	int nerr;

	if (native_aio_module_handle != NULL) {
		/* Linking has been done successfully. */
		return (0);
	}

	nerr = linker_reference_module(NATIVE_AIO_MODULE_NAME,
	    &native_aio_module_depend, &native_aio_module_handle);
	if (nerr)
		return (nerr);

	do {
		nerr = EINVAL;

		/* Kernel internal functions */
		GET_INTERNAL_FUNC_POINTER(aio_init_aioinfo);
		GET_INTERNAL_FUNC_POINTER(aio_aqueue);

		/* System calls */
		GET_SYSCALL_POINTER(aio_return);
		GET_SYSCALL_POINTER(aio_suspend);
		GET_SYSCALL_POINTER(aio_cancel);
		GET_SYSCALL_POINTER(aio_error);

		nerr = 0;
	} while (0);

	if (nerr) {
		linker_release_module(NULL, NULL, native_aio_module_handle);
		native_aio_module_handle = NULL;

		printf(LMSG("Unable to link to the native module \""
		    NATIVE_AIO_MODULE_NAME "\"."));

		return (nerr);
	}

	return (0);
}

#define LINK_TO_NATIVE_AIO_MODULE() \
	if (link_to_native_aio_module(td)) { \
		printf(LMSG("Please load the module \"" \
		    NATIVE_AIO_MODULE_NAME "\" " \
		    "to provide FreeBSD " \
		    "native Asynchronous I/O support.")); \
		return (ENOSYS); \
	}

static int mirror_native_aio_sysctl(struct thread *td)
{
	int nerr = 0;
	size_t l;

	l = sizeof(native_aio_capacity_proc);
	nerr = kernel_sysctlbyname(td, NATIVE_AIO_SYSCTL_CAPACITY_PROC,
	    &native_aio_capacity_proc, &l, NULL, 0, NULL, 0);
	if (nerr)
		return (nerr);

	l = sizeof(native_aio_capacity_sys);
	nerr = kernel_sysctlbyname(td, NATIVE_AIO_SYSCTL_CAPACITY_SYS,
	    &native_aio_capacity_sys, &l, NULL, 0, NULL, 0);
	if (nerr)
		return (nerr);

	DPRINTF(NATIVE_AIO_SYSCTL_CAPACITY_PROC "=%d, "
	    NATIVE_AIO_SYSCTL_CAPACITY_SYS "=%d",
	    native_aio_capacity_proc,
	    native_aio_capacity_sys);

	return (nerr);
}

/* Linux system call io_setup(2) */
int linux_io_setup(struct thread *td, struct linux_io_setup_args *args)
{
	struct proc *p;
	struct linux_aio_ring *pring, ring;
	struct linux_aio_context *pctx = NULL, *ptmpctx;
	linux_aio_context_t ctx_id;
	int nerr = 0, nr, nrall, nq, arg_nr_reqs;

	DARGPRINTF("%u, %p", args->nr_reqs, args->ctxp);
	LINK_TO_NATIVE_AIO_MODULE();
	nerr = mirror_native_aio_sysctl(td);
	if (nerr) {
		printf(LMSG("linux_io_setup(): Unable to query sysctls "
		    NATIVE_AIO_SYSCTL_CAPACITY_PROC
		    " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS " ."));
		return (nerr);
	}

	/* Signed integer is a little safer than unsigned */
	arg_nr_reqs = args->nr_reqs;
	if (arg_nr_reqs <= 0)
		return (EINVAL);

	if (arg_nr_reqs > native_aio_capacity_proc
	    || arg_nr_reqs > native_aio_capacity_sys) {
		printf(LMSG("linux_io_setup(): Please increase sysctls "
		    NATIVE_AIO_SYSCTL_CAPACITY_PROC
		    " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS " ."));
		return (ENOMEM);
	}

	nerr = user_mem_rw_verify(args->ctxp, sizeof(*(args->ctxp)));
	if (nerr != 0)
		return (nerr);

	copyin(args->ctxp, &ctx_id, sizeof(ctx_id));
	if (ctx_id != 0)	/* "Not initialized", described by io_setup(2) */
		return (EINVAL);

	p = td->td_proc;

	/* Get a new "ring" */
	nerr = user_malloc(td, (void **)&pring, sizeof(*pring));
	if (nerr != 0)
		return (nerr);

	/* Get a new context */
	pctx = uma_zalloc(linux_aio_context_zone, M_WAITOK);

	LINUX_AIO_CTX_LIST_LOCK();

	/* Count request capacity of all contexts belonging to this process */
	nr = 0;
	nrall = 0;
	nq = 0;
	LINUX_AIO_CTX_FOREACH(ptmpctx) {
		if (ptmpctx->ctx_pid == p->p_pid) {
			nr += ptmpctx->ctx_nreq_max;
			nq++;
		}
		nrall += ptmpctx->ctx_nreq_max;
	}
	DPRINTF("%d queues of %d requests totally allocated for this process, "
	    "%d requests' total capacity for the whole system",
	    nq, nr, nrall);

	/* Check whether there are enough resources for requested queue */
	if (arg_nr_reqs > native_aio_capacity_proc - nr
	    || arg_nr_reqs > native_aio_capacity_sys - nrall) {
		printf(LMSG("linux_io_setup(): "
		    "Please increase sysctls "
		    NATIVE_AIO_SYSCTL_CAPACITY_PROC
		    " and/or " NATIVE_AIO_SYSCTL_CAPACITY_SYS " ."
		    "Besides %d queues of %d requests totally "
		    "for this process, and %d requests' queues "
		    "totally for the whole system, "
		    "this Linux application needs one more "
		    "AIO queue of %d requests' capacity."),
		    nq, nr, nrall, arg_nr_reqs);
		LINUX_AIO_CTX_LIST_UNLOCK();
		DPRINTF("Free context %p", pctx);
		uma_zfree(linux_aio_context_zone, pctx);
		user_free(td, pring, sizeof(*pring));
		return (ENOMEM);
	}

	/* Initialize the new context */
	sx_init(&(pctx->ctx_sx), "linux_aio_context");
	pctx->ctx_pid = p->p_pid;
	pctx->ctx_pring = pring;
	pctx->ctx_nreq_max = arg_nr_reqs;
	pctx->ctx_nreq_cur = 0;
	STAILQ_INIT(&(pctx->ctx_req));

	/* Hook the new context to global context list */
	LINUX_AIO_CTX_HOOK(pctx);

	LINUX_AIO_CTX_LIST_UNLOCK();

	/* Initialize the new "ring" */
	DPRINTF("initialize the \"ring\" %p", pring);
	bzero(&ring, sizeof(ring));
	ring.ring_id = 1;
	ring.ring_nr = arg_nr_reqs;
	ring.ring_head = 0;
	ring.ring_tail = 1;
	ring.ring_magic = LINUX_AIO_RING_MAGIC;
	ring.ring_compat_features = LINUX_AIO_RING_COMPAT_FEATURES;
	ring.ring_incompat_features = LINUX_AIO_RING_INCOMPAT_FEATURES;
	ring.ring_header_length = sizeof(ring);
	copyout(&ring, pring, sizeof(ring));	/* It has been hooked before */

	/* Substantial return value */
	ctx_id = (linux_aio_context_t)pctx->ctx_pring;
	copyout(&ctx_id, args->ctxp, sizeof(ctx_id));
	DPRINTF("returned context: %lx -> %p", (unsigned long)ctx_id, pctx);

	return (nerr);
}

/* Linux system call io_destroy(2) */
int linux_io_destroy(struct thread *td, struct linux_io_destroy_args *args)
{
	int nerr = 0;
	struct proc *p;
	struct linux_aio_context *pctx;
	struct linux_aio_request *preq, *ptmpreq;
	struct aio_cancel_args cancelargs;
	struct aio_return_args aioretargs;

	DARGPRINTF("%lx", (unsigned long)args->ctx);
	LINK_TO_NATIVE_AIO_MODULE();

	p = td->td_proc;

	/*
	 * Locking:
	 *
	 * LINUX_AIO_LOCK(p);             <------------------+
	 * ...                                               |
	 * LINUX_AIO_CTX_LIST_LOCK();     <--+               |
	 * ...                               |               |
	 * LINUX_AIO_CTX_LIST_UNLOCK();   <--+               |
	 * ...                                               |
	 * LINUX_AIO_CTX_LOCK(pctx);      <------------------|--+
	 * LINUX_AIO_UNLOCK(p);           <------------------+  |
	 * ...                                                  |
	 * LINUX_AIO_CTX_UNLOCK(pctx);    <---------------------+
	 */

	LINUX_AIO_LOCK(p);

	/* Find the context in context list */
	LINUX_AIO_CTX_LIST_LOCK();
	LINUX_AIO_CTX_FOREACH(pctx) {
		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx, p->p_pid))
			break;
	}
	LINUX_AIO_CTX_LIST_UNLOCK();

	/* Unable to find the context */
	if (pctx == NULL) {
		LINUX_AIO_UNLOCK(p);
		return (EINVAL);
	}

	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx, pctx);

	/* Unhook the context from context list */
	DPRINTF("Unhook context %p", pctx);
	LINUX_AIO_CTX_UNHOOK(pctx);

	LINUX_AIO_CTX_LOCK(pctx);	/* XXX Interlaced, seamless */
	LINUX_AIO_UNLOCK(p);		/* XXX Interlaced, seamless */

	/* Real cleanup */
	LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
		DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)",
		    preq->req_porig, preq->req_pbsd);

		/* Cancel FreeBSD native clone */
		cancelargs.fd = preq->req_linux.aio_fildes;
		cancelargs.aiocbp = preq->req_pbsd;
		p_aio_cancel(td, &cancelargs);
		DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]);
		if (td->td_retval[0] == AIO_NOTCANCELED)
			printf(LMSG("linux_io_destroy(): Asynchronous IO "
			    "request (Linux: %p, FreeBSD: %p) "
			    "cannot be cancelled. "
			    "***** Both User Space "
			    "and Kernel Memory Leaked! *****"),
			    preq->req_porig, preq->req_pbsd);

		LINUX_AIO_REQ_UNHOOK(pctx, preq);

		if (td->td_retval[0] == AIO_ALLDONE) {
			aioretargs.aiocbp = preq->req_pbsd;
			p_aio_return(td, &aioretargs);
			DPRINTF("aio_return(%p) returned %ld",
			    aioretargs.aiocbp,
			    (long)td->td_retval[0]);

			td->td_retval[0] = AIO_ALLDONE;
		}

		/* Free user space clone of the request */
		if (td->td_retval[0] != AIO_NOTCANCELED)
			/* XXX How to avoid memory leak here? */
			user_free(td, preq->req_pbsd,
			    sizeof(*(preq->req_pbsd)));

		/* Free kernel structure of the request */
		uma_zfree(linux_aio_request_zone, preq);

		td->td_retval[0] = 0;
	}

	LINUX_AIO_CTX_UNLOCK(pctx);

	sx_destroy(&(pctx->ctx_sx));

	/* Free the "ring" */
	DPRINTF("free the \"ring\" %p", pctx->ctx_pring);
	user_free(td, pctx->ctx_pring, sizeof(*pctx->ctx_pring));

	/* Free destroyed context */
	uma_zfree(linux_aio_context_zone, pctx);

	return (nerr);
}

/* Linux system call io_getevents(2) */
int linux_io_getevents(struct thread *td, struct linux_io_getevents_args *args)
{
	int i, j, nerr = 0;
	struct proc *p;
	struct l_timespec l_timeout;
	struct timespec timeout, *u_ptimeout, t1, t2;
	struct linux_aio_context *pctx;
	struct linux_aio_request *preq, *ptmpreq;
	struct linux_io_event evt;
	struct aio_return_args aioretargs;
	struct aio_error_args aioerrargs;
	register_t aio_ret, aio_err;
	struct aiocb **u_aiocbp;
	struct aio_suspend_args aiosusargs;

	DARGPRINTF("%lx, %ld, %ld, %p, %p",
	    (unsigned long)args->ctx_id,
	    (long)args->min_nr, (long)args->nr,
	    args->events, args->timeout);
	LINK_TO_NATIVE_AIO_MODULE();

	if (args->nr <= 0)
		return (EINVAL);

	if (args->min_nr < 0)
		return (EINVAL);

	nerr = user_mem_rw_verify(args->events,
	    sizeof(*(args->events)) * args->nr);
	if (nerr != 0)
		return (nerr);

	if (args->timeout != NULL) {
		nerr = copyin(args->timeout, &l_timeout, sizeof(l_timeout));
		if (nerr != 0)
			return (nerr);
		timeout.tv_sec = l_timeout.tv_sec;
		timeout.tv_nsec = l_timeout.tv_nsec;
		DUMP_TIMESPEC("User specified timeout: ", &timeout, "");
	}

	p = td->td_proc;

	/*
	 * Locking:
	 *
	 * LINUX_AIO_LOCK(p);             <------------------+
	 * ...                                               |
	 * LINUX_AIO_CTX_LIST_LOCK();     <--+               |
	 * ...                               |               |
	 * LINUX_AIO_CTX_LIST_UNLOCK();   <--+               |
	 * ...                                               |
	 * LINUX_AIO_CTX_LOCK(pctx);      <------------------|--+
	 * LINUX_AIO_UNLOCK(p);           <------------------+  |
	 * ...                                                  |
	 * LINUX_AIO_CTX_UNLOCK(pctx);    <---------------------+
	 */

	LINUX_AIO_LOCK(p);

	/* Find the context in context list */
	LINUX_AIO_CTX_LIST_LOCK();
	LINUX_AIO_CTX_FOREACH(pctx) {
		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
			break;
	}
	LINUX_AIO_CTX_LIST_UNLOCK();

	/* Unable to find the context */
	if (pctx == NULL) {
		LINUX_AIO_UNLOCK(p);
		return (EINVAL);
	}

	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);

	LINUX_AIO_CTX_LOCK(pctx);	/* XXX Interlaced, seamless */
	LINUX_AIO_UNLOCK(p);		/* XXX Interlaced, seamless */

	if (STAILQ_EMPTY(&(pctx->ctx_req))) {
		td->td_retval[0] = 0;	/* No queued request */
		DPRINTF("No request in queue (context: %p) at all, "
		    "return directly", pctx);
	} else {	/* Deal with the request queue */
		i = 0;	/*
			 * This variable's value will be the return value
			 * of linux_io_getevents()
			 */

		nerr = user_malloc(td, (void **)&u_aiocbp,
		    sizeof(*u_aiocbp) * pctx->ctx_nreq_max);
		if (nerr != 0)
			goto skip_substantial_0;

		nerr = user_malloc(td, (void **)&u_ptimeout,
		    sizeof(*u_ptimeout));
		if (nerr != 0)
			goto skip_substantial_1;

		for (i = 0; i < args->nr; ) {

			/* Collecting finished requests and waiting for queued requests */

			LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {

				/* Collect all finished requests */

				if (i >= args->nr)	/* Full */
					break;

				aioerrargs.aiocbp = preq->req_pbsd;
				p_aio_error(td, &aioerrargs);
				aio_ret = td->td_retval[0];
				td->td_retval[0] = 0;

				DPRINTF("aio_error(%p) (Linux: %p) "
				    "returned %ld%s",
				    aioerrargs.aiocbp,
				    preq->req_porig,
				    (long)aio_ret,
				    aio_ret == EINPROGRESS ?
				    "(EINPROGRESS)" : "");

				if (aio_ret == EINPROGRESS)
					continue;

				/* Done */
				LINUX_AIO_REQ_UNHOOK(pctx, preq);

				aioretargs.aiocbp = preq->req_pbsd;
				aio_err = p_aio_return(td, &aioretargs);
				aio_ret = td->td_retval[0];
				td->td_retval[0] = 0;

				DPRINTF("aio_return(%p) (Linux: %p) "
				    "returned %ld, errno=%ld",
				    aioretargs.aiocbp,
				    preq->req_porig,
				    (long)aio_ret,
				    (long)aio_err);

				evt.data = preq->req_linux.aio_data;
				evt.obj = (uint64_t)(unsigned long)
				    preq->req_porig;
				if (aio_ret >= 0) {
					/* Normal return (success) */
					evt.res = aio_ret;
				} else {	/* Error code (failure) */
					/*
					 * Translate FreeBSD error code
					 * to Linux's
					 */
					evt.res =
					    p->p_sysent->sv_errtbl[aio_err];
				}
				DPRINTF("context %p (Linux: %p): "
				    "io_event.res=%lld",
				    preq->req_pbsd,
				    preq->req_porig,
				    (long long)evt.res);
				evt.res2 = 0;

				copyout(&evt, &(args->events[i]), sizeof(evt));

				uma_zfree(linux_aio_request_zone, preq);

				i++;
			} /* End of collecting all finished requests */

			if (STAILQ_EMPTY(&(pctx->ctx_req))) {
				/* No request remained in this context */
				DPRINTF("returning(context %p): "
				    "request queue is empty",
				    pctx);
				break;
			}

			if (i >= args->nr) {	/* Full */
				DPRINTF("returning(context %p): user space "
				    "event array is full",
				    pctx);
				break;
			}

			if (i >= args->min_nr) {
				/* Met the minimum requirement */
				DPRINTF("returning(context %p): "
				    "met the minimum requirement",
				    pctx);
				break;
			}

			if (args->timeout != NULL) {
				if (!timespecisset(&timeout)) { /* Timed out */
					DPRINTF("returning(context %p): "
					    "no time remaining",
					    pctx);
					break;
				}
			}

			if (args->timeout != NULL) {
				nanouptime(&t1);	/* Time before aio_suspend() */
				DUMP_TIMESPEC("T1: ", &t1,
				    " (uptime before calling aio_suspend())");
			}

			/* Prepare arguments for aio_suspend() */
			j = 0;
			LINUX_AIO_REQ_FOREACH(pctx, preq) {
				copyout(&(preq->req_pbsd), &(u_aiocbp[j]),
				    sizeof(preq->req_pbsd));
				j++;
			}
			MPASS(j == pctx->ctx_nreq_cur);
			aiosusargs.aiocbp = u_aiocbp;
			aiosusargs.nent = j;

			if (args->timeout != NULL) {
				copyout(&timeout, u_ptimeout, sizeof(timeout));
				aiosusargs.timeout = u_ptimeout;
				DUMP_TIMESPEC("Time remained: ", &timeout, "");
			} else {
				aiosusargs.timeout = NULL;
			}

			aio_err = p_aio_suspend(td, &aiosusargs);
			DPRINTF("aio_suspend(%p, %d, %p) returned %ld",
			    aiosusargs.aiocbp, aiosusargs.nent,
			    aiosusargs.timeout, (long)aio_err);

			if (args->timeout != NULL) {
				nanouptime(&t2);	/* Time after aio_suspend() */
				DUMP_TIMESPEC("T2: ", &t2,
				    " (uptime after calling aio_suspend())");
				timespecsub(&t2, &t1);	/* Time spent by aio_suspend() */
				DUMP_TIMESPEC("T_delta: ", &t2,
				    " (time spent by calling aio_suspend())");
				if (timespeccmp(&t2, &timeout, >=)) {
					timespecclear(&timeout); /* Timed out */
				} else {
					/* Time remaining */
					timespecsub(&timeout, &t2);
				}
				DUMP_TIMESPEC("Time remained: ", &timeout, "");
			}

			if (aio_err == EAGAIN) {	/* Timed out */
				DPRINTF("returning(context %p): "
				    "timed out after calling aio_suspend()",
				    pctx);
				break;
			}
		} /*
		   * End of collecting finished requests
		   * and waiting for queued requests
		   */

		l_timeout.tv_sec = timeout.tv_sec;
		l_timeout.tv_nsec = timeout.tv_nsec;
		copyout(&l_timeout, args->timeout, sizeof(l_timeout));
		/* No matter whether successfully or not */

		nerr = user_free(td, u_ptimeout, sizeof(*u_ptimeout));
skip_substantial_1:
		nerr = user_free(td, u_aiocbp,
		    sizeof(*u_aiocbp) * pctx->ctx_nreq_max);
skip_substantial_0:
		td->td_retval[0] = i;
		/* user_free() resets td->td_retval[0] to 0 */
		DPRINTF("%d requests are unhooked from the context %p", i, pctx);
	} /* End of dealing with request queue */

	LINUX_AIO_CTX_UNLOCK(pctx);

	return (nerr);
}

/* Linux system call io_submit(2) */
int linux_io_submit(struct thread *td, struct linux_io_submit_args *args)
{
	int i, nerr = 0;
	struct proc *p;
	struct linux_aio_context *pctx;
	struct linux_aio_request req, *preq;
	struct linux_iocb *porig;
	struct aiocb iocb, *piocb;

	DARGPRINTF("%lx, %ld, %p", (unsigned long)args->ctx_id,
	    (long)args->nr, args->iocbpp);
	LINK_TO_NATIVE_AIO_MODULE();

	if (args->nr <= 0)
		return (EINVAL);

	p = td->td_proc;

	/*
	 * Locking:
	 *
	 * LINUX_AIO_LOCK(p);             <------------------+
	 * ...                                               |
	 * LINUX_AIO_CTX_LIST_LOCK();     <--+               |
	 * ...                               |               |
	 * LINUX_AIO_CTX_LIST_UNLOCK();   <--+               |
	 * ...                                               |
	 * LINUX_AIO_CTX_LOCK(pctx);      <------------------|--+
	 * LINUX_AIO_UNLOCK(p);           <------------------+  |
	 * ...                                                  |
	 * LINUX_AIO_CTX_UNLOCK(pctx);    <---------------------+
	 */

	LINUX_AIO_LOCK(p);

	/* Find the context in context list */
	LINUX_AIO_CTX_LIST_LOCK();
	LINUX_AIO_CTX_FOREACH(pctx) {
		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
			break;
	}
	LINUX_AIO_CTX_LIST_UNLOCK();

	/* Unable to find the context */
	if (pctx == NULL) {
		LINUX_AIO_UNLOCK(p);
		return (EINVAL);
	}

	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);

	LINUX_AIO_CTX_LOCK(pctx);	/* XXX Interlaced, seamless */
	LINUX_AIO_UNLOCK(p);		/* XXX Interlaced, seamless */

	for (i = 0; pctx->ctx_nreq_cur < pctx->ctx_nreq_max && i < args->nr;
	    i++) {
		/* Get user space Linux control block */
		nerr = copyin(&(args->iocbpp[i]), &porig, sizeof(porig));
		if (nerr != 0)
			break;
		nerr = copyin(porig, &(req.req_linux), sizeof(req.req_linux));
		if (nerr != 0)
			break;

		/* Create user space FreeBSD control block clone */
		nerr = iocb_reformat(&(req.req_linux), &iocb);
		if (nerr != 0)
			break;
		nerr = user_malloc(td, (void **)&piocb, sizeof(*piocb));
		if (nerr != 0)
			break;
		nerr = copyout(&iocb, piocb, sizeof(iocb));
		if (nerr != 0)
			break;
		DUMP_FREEBSD_AIOCB(piocb, 1);

		/* Submit user space control block */
		nerr = p_aio_aqueue(td, piocb, NULL, iocb.aio_lio_opcode, 0);
		if (nerr != 0) {
			user_free(td, piocb, sizeof(*piocb));
			break;
		}

		req.req_porig = porig;
		req.req_pbsd = piocb;

		/* Hook request to the context */
		preq = uma_zalloc(linux_aio_request_zone, M_WAITOK);
		memcpy(preq, &req, sizeof(req));
		DPRINTF("Linux IOCB %p (aio_lio_opcode=%u, aio_fildes=%u), "
		    "FreeBSD IOCB %p",
		    preq->req_porig,
		    (unsigned)preq->req_linux.aio_lio_opcode,
		    (unsigned)preq->req_linux.aio_fildes,
		    preq->req_pbsd);
		LINUX_AIO_REQ_HOOK(pctx, preq);
	}

	LINUX_AIO_CTX_UNLOCK(pctx);

	if (i > 0) {
		td->td_retval[0] = i;
		nerr = 0;
	}

	if (i == 0 && nerr == 0)
		nerr = EAGAIN;	/* No request is successfully submitted */

	return (nerr);
}

/* Linux system call io_cancel(2) */
int linux_io_cancel(struct thread *td, struct linux_io_cancel_args *args)
{
	int nerr = 0;
	struct proc *p;
	struct linux_iocb lcb;
	struct linux_aio_context *pctx;
	struct linux_aio_request *preq;
	struct linux_io_event evt;
	struct aio_cancel_args aiocnclargs;

	DARGPRINTF("%lx, %p, %p", (unsigned long)args->ctx_id,
	    args->iocb, args->result);
	LINK_TO_NATIVE_AIO_MODULE();

	nerr = copyin(args->iocb, &lcb, sizeof(lcb));
	if (nerr != 0)
		return (nerr);

	nerr = user_mem_rw_verify(args->result, sizeof(*(args->result)));
	if (nerr != 0)
		return (nerr);

	p = td->td_proc;

	/*
	 * Locking:
	 *
	 * LINUX_AIO_LOCK(p);             <------------------+
	 * ...                                               |
	 * LINUX_AIO_CTX_LIST_LOCK();     <--+               |
	 * ...                               |               |
	 * LINUX_AIO_CTX_LIST_UNLOCK();   <--+               |
	 * ...                                               |
	 * LINUX_AIO_CTX_LOCK(pctx);      <------------------|--+
	 * LINUX_AIO_UNLOCK(p);           <------------------+  |
	 * ...                                                  |
	 * LINUX_AIO_CTX_UNLOCK(pctx);    <---------------------+
	 */

	LINUX_AIO_LOCK(p);

	/* Find the context in context list */
	LINUX_AIO_CTX_LIST_LOCK();
	LINUX_AIO_CTX_FOREACH(pctx) {
		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
			break;
	}
	LINUX_AIO_CTX_LIST_UNLOCK();

	/* Unable to find the context */
	if (pctx == NULL) {
		LINUX_AIO_UNLOCK(p);
		return (EINVAL);
	}

	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);

	LINUX_AIO_CTX_LOCK(pctx);	/* XXX Interlaced, seamless */
	LINUX_AIO_UNLOCK(p);		/* XXX Interlaced, seamless */

	LINUX_AIO_REQ_FOREACH(pctx, preq) {
		if (preq->req_porig == args->iocb
		    && preq->req_linux.aio_key == lcb.aio_key)
			break;
	}

	if (preq == NULL) {
		DPRINTF("Unable to find IO control block %p", args->iocb);
		nerr = EINVAL;
	} else {	/* Found the request in context */
		DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)",
		    preq->req_porig, preq->req_pbsd);

		/* Cancel FreeBSD native clone */
		aiocnclargs.fd = preq->req_linux.aio_fildes;
		aiocnclargs.aiocbp = preq->req_pbsd;
		p_aio_cancel(td, &aiocnclargs);
		DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]);

		if (td->td_retval[0] == AIO_CANCELED) {
			/* Cancellation succeeded */
			LINUX_AIO_REQ_UNHOOK(pctx, preq);

			evt.data = preq->req_linux.aio_data;
			evt.obj = (uint64_t)(unsigned long)preq->req_porig;
			evt.res = p->p_sysent->sv_errtbl[ECANCELED];
			evt.res2 = 0;

			/* Fill in user space structure linux_io_event */
			copyout(&evt, args->result, sizeof(evt));

			/* Free user space clone of the request */
			user_free(td, preq->req_pbsd,
			    sizeof(*(preq->req_pbsd)));

			/* Free kernel structure of the request */
			uma_zfree(linux_aio_request_zone, preq);
		} else if (td->td_retval[0] == AIO_ALLDONE) {
			/* This value of Linux 2.6.15 is really confusing !!! */
			nerr = EINVAL;
		} else {	/* AIO_NOTCANCELED */
			nerr = EAGAIN;
		}

		td->td_retval[0] = 0;
	}

	LINUX_AIO_CTX_UNLOCK(pctx);

	return (nerr);
}

static void linux_aio_proc_rundown(void *arg, struct proc *p)
{
	struct linux_aio_context *pctx, *ptmpctx;
	struct linux_aio_request *preq, *ptmpreq;

	/*
	 * FreeBSD module "aio" can do more essential native cleanup
	 * (i.e. cancelling all queued requests) itself.
	 */

	LINUX_AIO_CTX_LIST_LOCK();

	LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx) {
		if (pctx->ctx_pid == p->p_pid) {
			LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
				DPPRINTF("Free request %p from context %p "
				    "(ring: %p)",
				    preq, pctx, pctx->ctx_pring);
				LINUX_AIO_REQ_UNHOOK(pctx, preq);
				uma_zfree(linux_aio_request_zone, preq);
			}

			DPPRINTF("Free context %p (ring: %p)",
			    pctx, pctx->ctx_pring);

			/* Unhook it from context list */
			LINUX_AIO_CTX_UNHOOK(pctx);

			/* Free it really */
			sx_destroy(&(pctx->ctx_sx));
			uma_zfree(linux_aio_context_zone, pctx);

			DPPRINTF("The remaining context list is %s",
			    (SLIST_EMPTY(&linux_aio_context_list) ?
			    "empty" : "not empty"));
		}
	}

	LINUX_AIO_CTX_LIST_UNLOCK();
}

/*
 * Module constructor/destructor
 */
static int
linux_aio_modload(struct module *module, int cmd, void *arg)
{
	int nerr = 0;

	switch (cmd) {
	case MOD_LOAD:
		linux_aio_context_zone = uma_zcreate("LINUXAIOCTX",
		    sizeof(struct linux_aio_context),
		    NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		linux_aio_request_zone = uma_zcreate("LINUXAIOREQ",
		    sizeof(struct linux_aio_request),
		    NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		mtx_init(&linux_aio_context_list_mtx,
		    "linux_aio_context_list", NULL, MTX_DEF);
		SLIST_INIT(&linux_aio_context_list);
		linux_aio_exit_tag = EVENTHANDLER_REGISTER(process_exit,
		    linux_aio_proc_rundown,
		    NULL, EVENTHANDLER_PRI_ANY);
		break;
	case MOD_UNLOAD:
		LINUX_AIO_CTX_LIST_LOCK();
		if (!SLIST_EMPTY(&linux_aio_context_list)) {
			nerr = EBUSY;
			LINUX_AIO_CTX_LIST_UNLOCK();
			break;
		}
		EVENTHANDLER_DEREGISTER(process_exit, linux_aio_exit_tag);
		LINUX_AIO_CTX_LIST_UNLOCK();
		mtx_destroy(&linux_aio_context_list_mtx);
		uma_zdestroy(linux_aio_request_zone);
		uma_zdestroy(linux_aio_context_zone);
		if (native_aio_module_handle != NULL) {
			/*
			 * linker_release_module() cannot be used here.
			 * It tries to hold "kld_sx", conflicting against
			 * module_unload().
			 */
			linker_file_unload(native_aio_module_handle,
			    LINKER_UNLOAD_NORMAL);
		}
		break;
	case MOD_SHUTDOWN:
		break;
	default:
		nerr = EINVAL;
		break;
	}
	return (nerr);
}

static moduledata_t linux_aio_mod = {
	"linuxaio",
	&linux_aio_modload,
	NULL
};

DECLARE_MODULE(linuxaio, linux_aio_mod, SI_SUB_VFS, SI_ORDER_ANY);