Implement sendfile(2) for the posix shared memory segment file descriptor,

in addition to the regular files.

Requested by:	alc
Discussed with:	emaste
Tested by:	pho (previous version)
Sponsored by:	The FreeBSD Foundation
Approved by:	re (hrs)
This commit is contained in:
Konstantin Belousov 2013-09-11 06:41:15 +00:00
parent ed03145ec8
commit 227aaa86ed
2 changed files with 251 additions and 127 deletions

View file

@ -134,7 +134,7 @@ static struct fileops shm_ops = {
.fo_close = shm_close,
.fo_chmod = shm_chmod,
.fo_chown = shm_chown,
.fo_sendfile = invfo_sendfile,
.fo_sendfile = vn_sendfile,
.fo_seek = shm_seek,
.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

View file

@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@ -57,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
@ -86,7 +88,7 @@ __FBSDID("$FreeBSD$");
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
@ -1850,8 +1852,6 @@ getsockaddr(namp, uaddr, len)
return (error);
}
#include <sys/condvar.h>
struct sendfile_sync {
struct mtx mtx;
struct cv cv;
@ -1917,6 +1917,10 @@ do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
cap_rights_t rights;
int error;
/*
* File offset must be positive. If it goes beyond EOF
* we send only the header/trailer and no payload data.
*/
if (uap->offset < 0)
return (EINVAL);
@ -1978,79 +1982,240 @@ freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
}
#endif /* COMPAT_FREEBSD4 */
static int
sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
{
vm_page_t m;
vm_pindex_t pindex;
ssize_t resid;
int error, readahead, rv;
pindex = OFF_TO_IDX(off);
VM_OBJECT_WLOCK(obj);
m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
/*
* Check if page is valid for what we need, otherwise initiate I/O.
*
* The non-zero nd argument prevents disk I/O, instead we
* return the caller what he specified in nd. In particular,
* if we already turned some pages into mbufs, nd == EAGAIN
* and the main function send them the pages before we come
* here again and block.
*/
if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
if (vp == NULL)
vm_page_xunbusy(m);
VM_OBJECT_WUNLOCK(obj);
*res = m;
return (0);
} else if (nd != 0) {
if (vp == NULL)
vm_page_xunbusy(m);
error = nd;
goto free_page;
}
/*
* Get the page from backing store.
*/
error = 0;
if (vp != NULL) {
VM_OBJECT_WUNLOCK(obj);
readahead = sfreadahead * MAXBSIZE;
/*
* Use vn_rdwr() instead of the pager interface for
* the vnode, to allow the read-ahead.
*
* XXXMAC: Because we don't have fp->f_cred here, we
* pass in NOCRED. This is probably wrong, but is
* consistent with our original implementation.
*/
error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
SFSTAT_INC(sf_iocnt);
VM_OBJECT_WLOCK(obj);
} else {
if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
rv = vm_pager_get_pages(obj, &m, 1, 0);
SFSTAT_INC(sf_iocnt);
m = vm_page_lookup(obj, pindex);
if (m == NULL)
error = EIO;
else if (rv != VM_PAGER_OK) {
vm_page_lock(m);
vm_page_free(m);
vm_page_unlock(m);
m = NULL;
error = EIO;
}
} else {
pmap_zero_page(m);
m->valid = VM_PAGE_BITS_ALL;
m->dirty = 0;
}
if (m != NULL)
vm_page_xunbusy(m);
}
if (error == 0) {
*res = m;
} else if (m != NULL) {
free_page:
vm_page_lock(m);
vm_page_unwire(m, 0);
/*
* See if anyone else might know about this page. If
* not and it is not valid, then free it.
*/
if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
vm_page_free(m);
vm_page_unlock(m);
}
VM_OBJECT_WUNLOCK(obj);
KASSERT(error != 0 || (m->wire_count > 0 && m->valid ==
VM_PAGE_BITS_ALL),
("wrong page state m %p", m));
return (error);
}
static int
sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
int *bsize)
{
struct vattr va;
vm_object_t obj;
struct vnode *vp;
struct shmfd *shmfd;
int error;
vp = *vp_res = NULL;
obj = NULL;
shmfd = *shmfd_res = NULL;
*bsize = 0;
/*
* The file descriptor must be a regular file and have a
* backing VM object.
*/
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
vn_lock(vp, LK_SHARED | LK_RETRY);
if (vp->v_type != VREG) {
error = EINVAL;
goto out;
}
*bsize = vp->v_mount->mnt_stat.f_iosize;
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error != 0)
goto out;
*obj_size = va.va_size;
obj = vp->v_object;
if (obj == NULL) {
error = EINVAL;
goto out;
}
} else if (fp->f_type == DTYPE_SHM) {
shmfd = fp->f_data;
obj = shmfd->shm_object;
*obj_size = shmfd->shm_size;
} else {
error = EINVAL;
goto out;
}
VM_OBJECT_WLOCK(obj);
if ((obj->flags & OBJ_DEAD) != 0) {
VM_OBJECT_WUNLOCK(obj);
error = EBADF;
goto out;
}
/*
* Temporarily increase the backing VM object's reference
* count so that a forced reclamation of its vnode does not
* immediately destroy it.
*/
vm_object_reference_locked(obj);
VM_OBJECT_WUNLOCK(obj);
*obj_res = obj;
*vp_res = vp;
*shmfd_res = shmfd;
out:
if (vp != NULL)
VOP_UNLOCK(vp, 0);
return (error);
}
static int
kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
struct socket **so)
{
cap_rights_t rights;
int error;
*sock_fp = NULL;
*so = NULL;
/*
* The socket must be a stream socket and connected.
*/
error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights,
CAP_SEND), sock_fp, NULL);
if (error != 0)
return (error);
*so = (*sock_fp)->f_data;
if ((*so)->so_type != SOCK_STREAM)
return (EINVAL);
if (((*so)->so_state & SS_ISCONNECTED) == 0)
return (ENOTCONN);
return (0);
}
int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
int kflags, struct thread *td)
{
struct vnode *vp = fp->f_vnode;
struct file *sock_fp;
struct vm_object *obj = NULL;
struct socket *so = NULL;
struct mbuf *m = NULL;
struct vnode *vp;
struct vm_object *obj;
struct socket *so;
struct mbuf *m;
struct sf_buf *sf;
struct vm_page *pg;
struct shmfd *shmfd;
struct sendfile_sync *sfs;
struct vattr va;
struct sendfile_sync *sfs = NULL;
cap_rights_t rights;
off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
int bsize, error, hdrlen = 0, mnw = 0;
off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
int error, bsize, nd, hdrlen, mnw;
bool inflight_called;
vn_lock(vp, LK_SHARED | LK_RETRY);
if (vp->v_type == VREG) {
bsize = vp->v_mount->mnt_stat.f_iosize;
if (nbytes == 0) {
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error != 0) {
VOP_UNLOCK(vp, 0);
obj = NULL;
goto out;
}
rem = va.va_size;
} else
rem = nbytes;
obj = vp->v_object;
if (obj != NULL) {
/*
* Temporarily increase the backing VM
* object's reference count so that a forced
* reclamation of its vnode does not
* immediately destroy it.
*/
VM_OBJECT_WLOCK(obj);
if ((obj->flags & OBJ_DEAD) == 0) {
vm_object_reference_locked(obj);
VM_OBJECT_WUNLOCK(obj);
} else {
VM_OBJECT_WUNLOCK(obj);
obj = NULL;
}
}
} else
bsize = 0; /* silence gcc */
VOP_UNLOCK(vp, 0);
if (obj == NULL) {
error = EINVAL;
goto out;
}
obj = NULL;
so = NULL;
m = NULL;
sfs = NULL;
fsbytes = sbytes = 0;
hdrlen = mnw = 0;
rem = nbytes;
inflight_called = false;
/*
* The socket must be a stream socket and connected.
* Remember if it a blocking or non-blocking socket.
*/
error = getsock_cap(td->td_proc->p_fd, sockfd,
cap_rights_init(&rights, CAP_SEND), &sock_fp, NULL);
error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
if (error != 0)
return (error);
if (rem == 0)
rem = obj_size;
error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
if (error != 0)
goto out;
so = sock_fp->f_data;
if (so->so_type != SOCK_STREAM) {
error = EINVAL;
goto out;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
goto out;
}
/*
* Do not wait on memory allocations but return ENOMEM for
* caller to retry later.
@ -2123,7 +2288,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
int done;
if ((nbytes != 0 && nbytes == fsbytes) ||
(nbytes == 0 && va.va_size == fsbytes))
(nbytes == 0 && obj_size == fsbytes))
break;
mtail = NULL;
@ -2197,13 +2362,16 @@ retry_space:
*/
space -= hdrlen;
error = vn_lock(vp, LK_SHARED);
if (error != 0)
goto done;
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error != 0 || off >= va.va_size) {
VOP_UNLOCK(vp, 0);
goto done;
if (vp != NULL) {
error = vn_lock(vp, LK_SHARED);
if (error != 0)
goto done;
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error != 0 || off >= va.va_size) {
VOP_UNLOCK(vp, 0);
goto done;
}
obj_size = va.va_size;
}
/*
@ -2211,7 +2379,6 @@ retry_space:
* dumped into socket buffer.
*/
while (space > loopbytes) {
vm_pindex_t pindex;
vm_offset_t pgoff;
struct mbuf *m0;
@ -2221,7 +2388,7 @@ retry_space:
* or the passed in nbytes.
*/
pgoff = (vm_offset_t)(off & PAGE_MASK);
rem = va.va_size - offset;
rem = obj_size - offset;
if (nbytes != 0)
rem = omin(rem, nbytes);
rem -= fsbytes + loopbytes;
@ -2236,59 +2403,15 @@ retry_space:
* Attempt to look up the page. Allocate
* if not found or wait and loop if busy.
*/
pindex = OFF_TO_IDX(off);
VM_OBJECT_WLOCK(obj);
pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
VM_ALLOC_IGN_SBUSY | VM_ALLOC_NORMAL |
VM_ALLOC_WIRED);
/*
* Check if page is valid for what we need,
* otherwise initiate I/O.
* If we already turned some pages into mbufs,
* send them off before we come here again and
* block.
*/
if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
VM_OBJECT_WUNLOCK(obj);
else if (m != NULL)
error = EAGAIN; /* send what we already got */
else if (flags & SF_NODISKIO)
error = EBUSY;
else {
ssize_t resid;
int readahead = sfreadahead * MAXBSIZE;
VM_OBJECT_WUNLOCK(obj);
/*
* Get the page from backing store.
* XXXMAC: Because we don't have fp->f_cred
* here, we pass in NOCRED. This is probably
* wrong, but is consistent with our original
* implementation.
*/
error = vn_rdwr(UIO_READ, vp, NULL, readahead,
trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT),
td->td_ucred, NOCRED, &resid, td);
SFSTAT_INC(sf_iocnt);
if (error != 0)
VM_OBJECT_WLOCK(obj);
}
if (m != NULL)
nd = EAGAIN; /* send what we already got */
else if ((flags & SF_NODISKIO) != 0)
nd = EBUSY;
else
nd = 0;
error = sendfile_readpage(obj, vp, nd, off,
xfsize, bsize, td, &pg);
if (error != 0) {
vm_page_lock(pg);
vm_page_unwire(pg, 0);
/*
* See if anyone else might know about
* this page. If not and it is not valid,
* then free it.
*/
if (pg->wire_count == 0 && pg->valid == 0 &&
!vm_page_busied(pg))
vm_page_free(pg);
vm_page_unlock(pg);
VM_OBJECT_WUNLOCK(obj);
if (error == EAGAIN)
error = 0; /* not a real error */
break;
@ -2358,7 +2481,8 @@ retry_space:
}
}
VOP_UNLOCK(vp, 0);
if (vp != NULL)
VOP_UNLOCK(vp, 0);
/* Add the buffer chain to the socket buffer. */
if (m != NULL) {